1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 package net.sf.classifier4J;
53
54 import java.util.Stack;
55 /***
56 * <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed
57 * in a normal web browser.</p>
58 *
59 * <p>It does not handle meta tags, alt or text attributes, but it does remove
60 * CSS style definitions and javascript code.</p>
61 *
62 * <p>It handles entity references by replacing them with a space(!!). This can be
63 * overridden.</p>
64 *
65 *
66 * @since 18 Nov 2003
67 * @author Nick Lothian
68 */
69 public class SimpleHTMLTokenizer extends DefaultTokenizer {
70
71 /***
72 * Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
73 */
74 public SimpleHTMLTokenizer() {
75 super();
76 }
77
78 public SimpleHTMLTokenizer(int tokenizerConfig) {
79 super(tokenizerConfig);
80 }
81
82 public SimpleHTMLTokenizer(String regularExpression) {
83 super(regularExpression);
84 }
85
86 /***
87 * Replaces entity references with spaces
88 *
89 * @param contentsWithUnresolvedEntityReferences the contents with the entity references
90 * @return the contents with the entities replaces with spaces
91 */
92 protected String resolveEntities(String contentsWithUnresolvedEntityReferences) {
93 if (contentsWithUnresolvedEntityReferences == null) {
94 throw new IllegalArgumentException("Cannot pass null");
95 }
96
97 return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " ");
98 }
99
100 /***
101 * @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String)
102 */
103 public String[] tokenize(String input) {
104 Stack stack = new Stack();
105 Stack tagStack = new Stack();
106
107
108 char[] chars = input.toCharArray();
109
110 StringBuffer result = new StringBuffer();
111
112 StringBuffer currentTagName = new StringBuffer();
113 for (int i = 0; i < chars.length; i++) {
114
115 switch (chars[i]) {
116 case '<' :
117 stack.push(Boolean.TRUE);
118 currentTagName = new StringBuffer();
119 break;
120 case '>' :
121 stack.pop();
122 if (currentTagName != null) {
123 String currentTag = currentTagName.toString();
124
125 if (currentTag.startsWith("/")) {
126 tagStack.pop();
127 } else {
128
129 tagStack.push(currentTag.toLowerCase());
130 }
131 }
132 break;
133 default :
134 if (stack.size() == 0) {
135 String currentTag = (String) tagStack.peek();
136
137 if (currentTag != null) {
138 if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) {
139 result.append(chars[i]);
140 }
141 } else {
142 result.append(chars[i]);
143 }
144
145 } else {
146 currentTagName.append(chars[i]);
147 }
148 break;
149 }
150 }
151
152 return super.tokenize(resolveEntities(result.toString()).trim());
153 }
154
155 }