| %line | %branch | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| net.sf.classifier4J.SimpleHTMLTokenizer |
|
|
| 1 | /* |
|
| 2 | * ==================================================================== |
|
| 3 | * |
|
| 4 | * The Apache Software License, Version 1.1 |
|
| 5 | * |
|
| 6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
| 7 | * |
|
| 8 | * Redistribution and use in source and binary forms, with or without |
|
| 9 | * modification, are permitted provided that the following conditions |
|
| 10 | * are met: |
|
| 11 | * |
|
| 12 | * 1. Redistributions of source code must retain the above copyright |
|
| 13 | * notice, this list of conditions and the following disclaimer. |
|
| 14 | * |
|
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
| 16 | * notice, this list of conditions and the following disclaimer in |
|
| 17 | * the documentation and/or other materials provided with the |
|
| 18 | * distribution. |
|
| 19 | * |
|
| 20 | * 3. The end-user documentation included with the redistribution, if |
|
| 21 | * any, must include the following acknowlegement: |
|
| 22 | * "This product includes software developed by the |
|
| 23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
| 24 | * Alternately, this acknowlegement may appear in the software itself, |
|
| 25 | * if and wherever such third-party acknowlegements normally appear. |
|
| 26 | * |
|
| 27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
| 28 | * products derived from this software without prior written |
|
| 29 | * permission. For written permission, please contact |
|
| 30 | * http://sourceforge.net/users/nicklothian/. |
|
| 31 | * |
|
| 32 | * 5. Products derived from this software may not be called |
|
| 33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
| 34 | * without prior written permission. For written permission, please |
|
| 35 | * contact http://sourceforge.net/users/nicklothian/. |
|
| 36 | * |
|
| 37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
| 38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
| 39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
| 40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
| 41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
| 42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
| 43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
| 44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
| 45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
| 46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
| 47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
| 48 | * SUCH DAMAGE. |
|
| 49 | * ==================================================================== |
|
| 50 | */ |
|
| 51 | ||
| 52 | package net.sf.classifier4J; |
|
| 53 | ||
| 54 | import java.util.Stack; |
|
| 55 | /** |
|
| 56 | * <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed |
|
| 57 | * in a normal web browser.</p> |
|
| 58 | * |
|
| 59 | * <p>It does not handle meta tags, alt or text attributes, but it does remove |
|
| 60 | * CSS style definitions and javascript code.</p> |
|
| 61 | * |
|
| 62 | * <p>It handles entity references by replacing them with a space(!!). This can be |
|
| 63 | * overridden.</p> |
|
| 64 | * |
|
| 65 | * |
|
| 66 | * @since 18 Nov 2003 |
|
| 67 | * @author Nick Lothian |
|
| 68 | */ |
|
| 69 | public class SimpleHTMLTokenizer extends DefaultTokenizer { |
|
| 70 | ||
| 71 | /** |
|
| 72 | * Constructor that uses the BREAK_ON_WORD_BREAKS tokenizer config by default |
|
| 73 | */ |
|
| 74 | public SimpleHTMLTokenizer() { |
|
| 75 | 4 | super(); |
| 76 | 4 | } |
| 77 | ||
| 78 | public SimpleHTMLTokenizer(int tokenizerConfig) { |
|
| 79 | 0 | super(tokenizerConfig); |
| 80 | 0 | } |
| 81 | ||
| 82 | public SimpleHTMLTokenizer(String regularExpression) { |
|
| 83 | 0 | super(regularExpression); |
| 84 | 0 | } |
| 85 | ||
| 86 | /** |
|
| 87 | * Replaces entity references with spaces |
|
| 88 | * |
|
| 89 | * @param contentsWithUnresolvedEntityReferences the contents with the entity references |
|
| 90 | * @return the contents with the entities replaced with spaces |
|
| 91 | */ |
|
| 92 | protected String resolveEntities(String contentsWithUnresolvedEntityReferences) { |
|
| 93 | 6 | if (contentsWithUnresolvedEntityReferences == null) { |
| 94 | 0 | throw new IllegalArgumentException("Cannot pass null"); |
| 95 | } |
|
| 96 | ||
| 97 | 6 | return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " "); |
| 98 | } |
|
| 99 | ||
| 100 | /** |
|
| 101 | * @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String) |
|
| 102 | */ |
|
| 103 | public String[] tokenize(String input) { |
|
| 104 | 2 | Stack stack = new Stack(); |
| 105 | 2 | Stack tagStack = new Stack(); |
| 106 | ||
| 107 | // iterate over the input string and find the text that would be displayed |
|
| 108 | 2 | char[] class="keyword">chars = input.toCharArray(); |
| 109 | ||
| 110 | 2 | StringBuffer result = new StringBuffer(); |
| 111 | ||
| 112 | 2 | StringBuffer currentTagName = new StringBuffer(); |
| 113 | 74 | for (int i = 0; i < chars.length; i++) { |
| 114 | ||
| 115 | 72 | switch (chars[i]) { |
| 116 | case '<' : |
|
| 117 | 4 | stack.push(Boolean.TRUE); |
| 118 | 4 | currentTagName = new StringBuffer(); |
| 119 | 4 | break; |
| 120 | case '>' : |
|
| 121 | 4 | stack.pop(); |
| 122 | 4 | if (currentTagName != null) { |
| 123 | 4 | String currentTag = currentTagName.toString(); |
| 124 | ||
| 125 | 4 | if (currentTag.startsWith("/")) { |
| 126 | 2 | tagStack.pop(); |
| 127 | } else { |
|
| 128 | ||
| 129 | 2 | tagStack.push(currentTag.toLowerCase()); |
| 130 | } |
|
| 131 | } |
|
| 132 | break; |
|
| 133 | default : |
|
| 134 | 64 | if (stack.size() == 0) { |
| 135 | 54 | String currentTag = (String) tagStack.peek(); |
| 136 | // ignore everything inside <script></script> or <style></style> tags |
|
| 137 | 54 | if (currentTag != null) { |
| 138 | 54 | if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) { |
| 139 | 54 | result.append(chars[i]); |
| 140 | } |
|
| 141 | } else { |
|
| 142 | 0 | result.append(chars[i]); |
| 143 | } |
|
| 144 | ||
| 145 | } else { |
|
| 146 | 10 | currentTagName.append(chars[i]); |
| 147 | } |
|
| 148 | break; |
|
| 149 | } |
|
| 150 | } |
|
| 151 | ||
| 152 | 2 | return super.tokenize(resolveEntities(result.toString()).trim()); |
| 153 | } |
|
| 154 | ||
| 155 | } |
| This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |