1 /*
2 * ====================================================================
3 *
4 * The Apache Software License, Version 1.1
5 *
6 * Copyright (c) 2003 Nick Lothian. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution, if
21 * any, must include the following acknowlegement:
22 * "This product includes software developed by the
23 * developers of Classifier4J (http://classifier4j.sf.net/)."
24 * Alternately, this acknowlegement may appear in the software itself,
25 * if and wherever such third-party acknowlegements normally appear.
26 *
27 * 4. The name "Classifier4J" must not be used to endorse or promote
28 * products derived from this software without prior written
29 * permission. For written permission, please contact
30 * http://sourceforge.net/users/nicklothian/.
31 *
32 * 5. Products derived from this software may not be called
33 * "Classifier4J", nor may "Classifier4J" appear in their names
34 * without prior written permission. For written permission, please
35 * contact http://sourceforge.net/users/nicklothian/.
36 *
37 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 * ====================================================================
50 */
51
52 package net.sf.classifier4J;
53
54 import java.util.Stack;
55 /***
56 * <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed
57 * in a normal web browser.</p>
58 *
59 * <p>It does not handle meta tags, alt or text attributes, but it does remove
60 * CSS style definitions and javascript code.</p>
61 *
62 * <p>It handles entity references by replacing them with a space(!!). This can be
63 * overridden.</p>
64 *
65 *
66 * @since 18 Nov 2003
67 * @author Nick Lothian
68 */
69 public class SimpleHTMLTokenizer extends DefaultTokenizer {
70
71 /***
72 * Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
73 */
74 public SimpleHTMLTokenizer() {
75 super();
76 }
77
78 public SimpleHTMLTokenizer(int tokenizerConfig) {
79 super(tokenizerConfig);
80 }
81
82 public SimpleHTMLTokenizer(String regularExpression) {
83 super(regularExpression);
84 }
85
86 /***
87 * Replaces entity references with spaces
88 *
89 * @param contentsWithUnresolvedEntityReferences the contents with the entity references
90 * @return the contents with the entities replaces with spaces
91 */
92 protected String resolveEntities(String contentsWithUnresolvedEntityReferences) {
93 if (contentsWithUnresolvedEntityReferences == null) {
94 throw new IllegalArgumentException("Cannot pass null");
95 }
96
97 return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " ");
98 }
99
100 /***
101 * @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String)
102 */
103 public String[] tokenize(String input) {
104 Stack stack = new Stack();
105 Stack tagStack = new Stack();
106
107 // iterate over the input string and parse find text that would be displayed
108 char[] chars = input.toCharArray();
109
110 StringBuffer result = new StringBuffer();
111
112 StringBuffer currentTagName = new StringBuffer();
113 for (int i = 0; i < chars.length; i++) {
114
115 switch (chars[i]) {
116 case '<' :
117 stack.push(Boolean.TRUE);
118 currentTagName = new StringBuffer();
119 break;
120 case '>' :
121 stack.pop();
122 if (currentTagName != null) {
123 String currentTag = currentTagName.toString();
124
125 if (currentTag.startsWith("/")) {
126 tagStack.pop();
127 } else {
128
129 tagStack.push(currentTag.toLowerCase());
130 }
131 }
132 break;
133 default :
134 if (stack.size() == 0) {
135 String currentTag = (String) tagStack.peek();
136 // ignore everything inside <script></script> or <style></style> tags
137 if (currentTag != null) {
138 if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) {
139 result.append(chars[i]);
140 }
141 } else {
142 result.append(chars[i]);
143 }
144
145 } else {
146 currentTagName.append(chars[i]);
147 }
148 break;
149 }
150 }
151
152 return super.tokenize(resolveEntities(result.toString()).trim());
153 }
154
155 }