1 /*
2 * ====================================================================
3 *
4 * The Apache Software License, Version 1.1
5 *
6 * Copyright (c) 2003-2005 Nick Lothian. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution, if
21 * any, must include the following acknowlegement:
22 * "This product includes software developed by the
23 * developers of Classifier4J (http://classifier4j.sf.net/)."
24 * Alternately, this acknowlegement may appear in the software itself,
25 * if and wherever such third-party acknowlegements normally appear.
26 *
27 * 4. The name "Classifier4J" must not be used to endorse or promote
28 * products derived from this software without prior written
29 * permission. For written permission, please contact
30 * http://sourceforge.net/users/nicklothian/.
31 *
32 * 5. Products derived from this software may not be called
33 * "Classifier4J", nor may "Classifier4J" appear in their names
34 * without prior written permission. For written permission, please
35 * contact http://sourceforge.net/users/nicklothian/.
36 *
37 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 * ====================================================================
50 */
51
52 package net.sf.classifier4J.summariser;
53
54 import java.util.ArrayList;
55 import java.util.Collections;
56 import java.util.Comparator;
57 import java.util.Iterator;
58 import java.util.LinkedHashSet;
59 import java.util.List;
60 import java.util.Map;
61 import java.util.Set;
62
63 import net.sf.classifier4J.Utilities;
64
65 public class SimpleSummariser implements ISummariser {
66
67 private Integer findMaxValue(List input) {
68 Collections.sort(input);
69 return (Integer) input.get(0);
70 }
71
72
73 protected Set getMostFrequentWords(int count, Map wordFrequencies) {
74 return Utilities.getMostFrequentWords(count, wordFrequencies);
75 }
76
77 /***
78 * @see net.sf.classifier4J.summariser.ISummariser#summarise(java.lang.String)
79 */
80 public String summarise(String input, int numSentences) {
81 // get the frequency of each word in the input
82 Map wordFrequencies = Utilities.getWordFrequency(input);
83
84 // now create a set of the X most frequent words
85 Set mostFrequentWords = getMostFrequentWords(100, wordFrequencies);
86
87 // break the input up into sentences
88 // workingSentences is used for the analysis, but
89 // actualSentences is used in the results so that the
90 // capitalisation will be correct.
91 String[] workingSentences = Utilities.getSentences(input.toLowerCase());
92 String[] actualSentences = Utilities.getSentences(input);
93
94 // iterate over the most frequent words, and add the first sentence
95 // that includes each word to the result
96 Set outputSentences = new LinkedHashSet();
97 Iterator it = mostFrequentWords.iterator();
98 while (it.hasNext()) {
99 String word = (String) it.next();
100 for (int i = 0; i < workingSentences.length; i++) {
101 if (workingSentences[i].indexOf(word) >= 0) {
102 outputSentences.add(actualSentences[i]);
103 break;
104 }
105 if (outputSentences.size() >= numSentences) {
106 break;
107 }
108 }
109 if (outputSentences.size() >= numSentences) {
110 break;
111 }
112
113 }
114
115 List reorderedOutputSentences = reorderSentences(outputSentences, input);
116
117 StringBuffer result = new StringBuffer("");
118 it = reorderedOutputSentences.iterator();
119 while (it.hasNext()) {
120 String sentence = (String) it.next();
121 result.append(sentence);
122 result.append("."); // This isn't always correct - perhaps it should be whatever symbol the sentence finished with
123 if (it.hasNext()) {
124 result.append(" ");
125 }
126 }
127
128 return result.toString();
129 }
130
131 /***
132 * @param outputSentences
133 * @param input
134 * @return
135 */
136 private List reorderSentences(Set outputSentences, final String input) {
137 // reorder the sentences to the order they were in the
138 // original text
139 ArrayList result = new ArrayList(outputSentences);
140
141 Collections.sort(result, new Comparator() {
142 public int compare(Object arg0, Object arg1) {
143 String sentence1 = (String) arg0;
144 String sentence2 = (String) arg1;
145
146 int indexOfSentence1 = input.indexOf(sentence1.trim());
147 int indexOfSentence2 = input.indexOf(sentence2.trim());
148 int result = indexOfSentence1 - indexOfSentence2;
149
150 return result;
151 }
152
153 });
154 return result;
155 }
156
157 }