SimpleHTMLTokenizer xref

View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J;
53  
54  import java.util.Stack;
55  /***
56   * <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed
57   * in a normal web browser.</p>
58   * 
59   * <p>It does not handle meta tags, alt or text attributes, but it does remove 
60   * CSS style definitions and javascript code.</p>
61   * 
62   * <p>It handles entity references by replacing them with a space(!!). This can be 
63   * overridden.</p> 
64   * 
65   * 
66   * @since 18 Nov 2003
67   * @author Nick Lothian
68   */
69  public class SimpleHTMLTokenizer extends DefaultTokenizer {
70  
71      /***
72       * Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
73       */
74      public SimpleHTMLTokenizer() {
75          super();
76      }
77  
78      public SimpleHTMLTokenizer(int tokenizerConfig) {
79          super(tokenizerConfig);
80      }
81  
82      public SimpleHTMLTokenizer(String regularExpression) {
83          super(regularExpression);
84      }
85  
86      /***
87       * Replaces entity references with spaces
88       * 
89       * @param contentsWithUnresolvedEntityReferences the contents with the entity references
90       * @return the contents with the entities replaces with spaces
91       */
92      protected String resolveEntities(String contentsWithUnresolvedEntityReferences) {
93          if (contentsWithUnresolvedEntityReferences == null) {
94              throw new IllegalArgumentException("Cannot pass null");
95          }
96  
97          return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " ");
98      }
99  
100     /***
101      * @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String)
102      */
103     public String[] tokenize(String input) {
104         Stack stack = new Stack();
105         Stack tagStack = new Stack();
106 
107         // iterate over the input string and parse find text that would be displayed
108         char[] chars = input.toCharArray();
109 
110         StringBuffer result = new StringBuffer();
111 
112         StringBuffer currentTagName = new StringBuffer();
113         for (int i = 0; i < chars.length; i++) {
114 
115             switch (chars[i]) {
116                 case '<' :
117                     stack.push(Boolean.TRUE);
118                     currentTagName = new StringBuffer();
119                     break;
120                 case '>' :
121                     stack.pop();
122                     if (currentTagName != null) {
123                         String currentTag = currentTagName.toString();
124 
125                         if (currentTag.startsWith("/")) {
126                             tagStack.pop();
127                         } else {
128 
129                             tagStack.push(currentTag.toLowerCase());
130                         }
131                     }
132                     break;
133                 default :
134                     if (stack.size() == 0) {
135                         String currentTag = (String) tagStack.peek();
136                         // ignore everything inside <script></script> or <style></style> tags
137                         if (currentTag != null) {
138                             if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) {
139                                 result.append(chars[i]);
140                             }
141                         } else {
142                             result.append(chars[i]);
143                         }
144 
145                     } else {
146                         currentTagName.append(chars[i]);
147                     }
148                     break;
149             }
150         }
151 
152         return super.tokenize(resolveEntities(result.toString()).trim());
153     }
154 
155 }