unit tests coverage

Coverage report

%line %branch

net.sf.classifier4J.SimpleHTMLTokenizer
82%

87%

1
/*

2
* ====================================================================

3
*

4
* The Apache Software License, Version 1.1

5
*

6
* Copyright (c) 2003 Nick Lothian. All rights reserved.

7
*

8
* Redistribution and use in source and binary forms, with or without

9
* modification, are permitted provided that the following conditions

10
* are met:

11
*

12
* 1. Redistributions of source code must retain the above copyright

13
* notice, this list of conditions and the following disclaimer.

14
*

15
* 2. Redistributions in binary form must reproduce the above copyright

16
* notice, this list of conditions and the following disclaimer in

17
* the documentation and/or other materials provided with the

18
* distribution.

19
*

20
* 3. The end-user documentation included with the redistribution, if

21
* any, must include the following acknowlegement:

22
* "This product includes software developed by the

23
* developers of Classifier4J (http://classifier4j.sf.net/)."

24
* Alternately, this acknowlegement may appear in the software itself,

25
* if and wherever such third-party acknowlegements normally appear.

26
*

27
* 4. The name "Classifier4J" must not be used to endorse or promote

28
* products derived from this software without prior written

29
* permission. For written permission, please contact

30
* http://sourceforge.net/users/nicklothian/.

31
*

32
* 5. Products derived from this software may not be called

33
* "Classifier4J", nor may "Classifier4J" appear in their names

34
* without prior written permission. For written permission, please

35
* contact http://sourceforge.net/users/nicklothian/.

36
*

37
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED

38
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES

39
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

40
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR

41
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

42
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

43
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF

44
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

45
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

46
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

47
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

48
* SUCH DAMAGE.

49
* ====================================================================

50
*/

51

52
package net.sf.classifier4J;

53

54
import java.util.Stack;

55
/**

56
* <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed

57
* in a normal web browser.</p>

58
*

59
* <p>It does not handle meta tags, alt or text attributes, but it does remove

60
* CSS style definitions and javascript code.</p>

61
*

62
* <p>It handles entity references by replacing them with a space(!!). This can be

63
* overridden.</p>

64
*

65
*

66
* @since 18 Nov 2003

67
* @author Nick Lothian

68
*/

69
public class SimpleHTMLTokenizer extends DefaultTokenizer {

70

71
/**

72
* Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default

73
*/

74
public SimpleHTMLTokenizer() {

75 4
super();

76 4
}

77

78
public SimpleHTMLTokenizer(int tokenizerConfig) {

79 0
super(tokenizerConfig);

80 0
}

81

82
public SimpleHTMLTokenizer(String regularExpression) {

83 0
super(regularExpression);

84 0
}

85

86
/**

87
* Replaces entity references with spaces

88
*

89
* @param contentsWithUnresolvedEntityReferences the contents with the entity references

90
* @return the contents with the entities replaces with spaces

91
*/

92
protected String resolveEntities(String contentsWithUnresolvedEntityReferences) {

93 6
if (contentsWithUnresolvedEntityReferences == null) {

94 0
throw new IllegalArgumentException("Cannot pass null");

95
}

96

97 6
return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " ");

98
}

99

100
/**

101
* @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String)

102
*/

103
public String[] tokenize(String input) {

104 2
Stack stack = new Stack();

105 2
Stack tagStack = new Stack();

106

107
// iterate over the input string and parse find text that would be displayed

108 2
char[] chars = input.toCharArray();

109

110 2
StringBuffer result = new StringBuffer();

111

112 2
StringBuffer currentTagName = new StringBuffer();

113 74
for (int i = 0; i < chars.length; i++) {

114

115 72
switch (chars[i]) {

116
case '<' :

117 4
stack.push(Boolean.TRUE);

118 4
currentTagName = new StringBuffer();

119 4
break;

120
case '>' :

121 4
stack.pop();

122 4
if (currentTagName != null) {

123 4
String currentTag = currentTagName.toString();

124

125 4
if (currentTag.startsWith("/")) {

126 2
tagStack.pop();

127
} else {

128

129 2
tagStack.push(currentTag.toLowerCase());

130
}

131
}

132
break;

133
default :

134 64
if (stack.size() == 0) {

135 54
String currentTag = (String) tagStack.peek();

136
// ignore everything inside <script></script> or <style></style> tags

137 54
if (currentTag != null) {

138 54
if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) {

139 54
result.append(chars[i]);

140
}

141
} else {

142 0
result.append(chars[i]);

143
}

144

145
} else {

146 10
currentTagName.append(chars[i]);

147
}

148
break;

149
}

150
}

151

152 2
return super.tokenize(resolveEntities(result.toString()).trim());

153
}

154

155
}

This report is generated by jcoverage, Maven and Maven JCoverage Plugin.

1		/*
2		* ====================================================================
3		*
4		* The Apache Software License, Version 1.1
5		*
6		* Copyright (c) 2003 Nick Lothian. All rights reserved.
7		*
8		* Redistribution and use in source and binary forms, with or without
9		* modification, are permitted provided that the following conditions
10		* are met:
11		*
12		* 1. Redistributions of source code must retain the above copyright
13		* notice, this list of conditions and the following disclaimer.
14		*
15		* 2. Redistributions in binary form must reproduce the above copyright
16		* notice, this list of conditions and the following disclaimer in
17		* the documentation and/or other materials provided with the
18		* distribution.
19		*
20		* 3. The end-user documentation included with the redistribution, if
21		* any, must include the following acknowlegement:
22		* "This product includes software developed by the
23		* developers of Classifier4J (http://classifier4j.sf.net/)."
24		* Alternately, this acknowlegement may appear in the software itself,
25		* if and wherever such third-party acknowlegements normally appear.
26		*
27		* 4. The name "Classifier4J" must not be used to endorse or promote
28		* products derived from this software without prior written
29		* permission. For written permission, please contact
30		* http://sourceforge.net/users/nicklothian/.
31		*
32		* 5. Products derived from this software may not be called
33		* "Classifier4J", nor may "Classifier4J" appear in their names
34		* without prior written permission. For written permission, please
35		* contact http://sourceforge.net/users/nicklothian/.
36		*
37		* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38		* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39		* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40		* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41		* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42		* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43		* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44		* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45		* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46		* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47		* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48		* SUCH DAMAGE.
49		* ====================================================================
50		*/
51
52		package net.sf.classifier4J;
53
54		import java.util.Stack;
55		/**
56		* <p>Simple HTML Tokenizer. Its goal is to tokenize words that would be displayed
57		* in a normal web browser.</p>
58		*
59		* <p>It does not handle meta tags, alt or text attributes, but it does remove
60		* CSS style definitions and javascript code.</p>
61		*
62		* <p>It handles entity references by replacing them with a space(!!). This can be
63		* overridden.</p>
64		*
65		*
66		* @since 18 Nov 2003
67		* @author Nick Lothian
68		*/
69		public class SimpleHTMLTokenizer extends DefaultTokenizer {
70
71		/**
72		* Constructor that using the BREAK_ON_WORD_BREAKS tokenizer config by default
73		*/
74		public SimpleHTMLTokenizer() {
75	4	super();
76	4	}
77
78		public SimpleHTMLTokenizer(int tokenizerConfig) {
79	0	super(tokenizerConfig);
80	0	}
81
82		public SimpleHTMLTokenizer(String regularExpression) {
83	0	super(regularExpression);
84	0	}
85
86		/**
87		* Replaces entity references with spaces
88		*
89		* @param contentsWithUnresolvedEntityReferences the contents with the entity references
90		* @return the contents with the entities replaces with spaces
91		*/
92		protected String resolveEntities(String contentsWithUnresolvedEntityReferences) {
93	6	if (contentsWithUnresolvedEntityReferences == null) {
94	0	throw new IllegalArgumentException("Cannot pass null");
95		}
96
97	6	return contentsWithUnresolvedEntityReferences.replaceAll("&.{2,8};", " ");
98		}
99
100		/**
101		* @see net.sf.classifier4J.ITokenizer#tokenize(java.lang.String)
102		*/
103		public String[] tokenize(String input) {
104	2	Stack stack = new Stack();
105	2	Stack tagStack = new Stack();
106
107		// iterate over the input string and parse find text that would be displayed
108	2	char[] chars = input.toCharArray();
109
110	2	StringBuffer result = new StringBuffer();
111
112	2	StringBuffer currentTagName = new StringBuffer();
113	74	for (int i = 0; i < chars.length; i++) {
114
115	72	switch (chars[i]) {
116		case '<' :
117	4	stack.push(Boolean.TRUE);
118	4	currentTagName = new StringBuffer();
119	4	break;
120		case '>' :
121	4	stack.pop();
122	4	if (currentTagName != null) {
123	4	String currentTag = currentTagName.toString();
124
125	4	if (currentTag.startsWith("/")) {
126	2	tagStack.pop();
127		} else {
128
129	2	tagStack.push(currentTag.toLowerCase());
130		}
131		}
132		break;
133		default :
134	64	if (stack.size() == 0) {
135	54	String currentTag = (String) tagStack.peek();
136		// ignore everything inside <script></script> or <style></style> tags
137	54	if (currentTag != null) {
138	54	if (!(currentTag.startsWith("script") || currentTag.startsWith("style"))) {
139	54	result.append(chars[i]);
140		}
141		} else {
142	0	result.append(chars[i]);
143		}
144
145		} else {
146	10	currentTagName.append(chars[i]);
147		}
148		break;
149		}
150		}
151
152	2	return super.tokenize(resolveEntities(result.toString()).trim());
153		}
154
155		}