View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003-2005 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J.summariser;
53  
54  import java.util.ArrayList;
55  import java.util.Collections;
56  import java.util.Comparator;
57  import java.util.Iterator;
58  import java.util.LinkedHashSet;
59  import java.util.List;
60  import java.util.Map;
61  import java.util.Set;
62  
63  import net.sf.classifier4J.Utilities;
64  
65  public class SimpleSummariser implements ISummariser {
66  
67      private Integer findMaxValue(List input) {
68          Collections.sort(input);
69          return (Integer) input.get(0);
70      }
71  
72  
73      protected Set getMostFrequentWords(int count, Map wordFrequencies) {
74          return Utilities.getMostFrequentWords(count, wordFrequencies);
75      }
76  
77      /***
78       * @see net.sf.classifier4J.summariser.ISummariser#summarise(java.lang.String)
79       */
80      public String summarise(String input, int numSentences) {
81          // get the frequency of each word in the input
82          Map wordFrequencies = Utilities.getWordFrequency(input);
83  
84          // now create a set of the X most frequent words
85          Set mostFrequentWords = getMostFrequentWords(100, wordFrequencies);
86  
87          // break the input up into sentences
88          // workingSentences is used for the analysis, but
89          // actualSentences is used in the results so that the 
90          // capitalisation will be correct.
91          String[] workingSentences = Utilities.getSentences(input.toLowerCase());
92          String[] actualSentences = Utilities.getSentences(input);
93  
94          // iterate over the most frequent words, and add the first sentence 
95          // that includes each word to the result
96          Set outputSentences = new LinkedHashSet();
97          Iterator it = mostFrequentWords.iterator();
98          while (it.hasNext()) {
99              String word = (String) it.next();
100             for (int i = 0; i < workingSentences.length; i++) {
101                 if (workingSentences[i].indexOf(word) >= 0) {
102                     outputSentences.add(actualSentences[i]);
103                     break;
104                 }
105                 if (outputSentences.size() >= numSentences) {
106                     break;
107                 }
108             }
109             if (outputSentences.size() >= numSentences) {
110                 break;
111             }
112 
113         }
114 
115         List reorderedOutputSentences = reorderSentences(outputSentences, input);
116 
117         StringBuffer result = new StringBuffer("");
118         it = reorderedOutputSentences.iterator();
119         while (it.hasNext()) {
120             String sentence = (String) it.next();
121             result.append(sentence);
122             result.append("."); // This isn't always correct - perhaps it should be whatever symbol the sentence finished with
123             if (it.hasNext()) {
124                 result.append(" ");
125             }
126         }
127 
128         return result.toString();
129     }
130 
131     /***
132      * @param outputSentences
133      * @param input
134      * @return
135      */
136     private List reorderSentences(Set outputSentences, final String input) {
137         // reorder the sentences to the order they were in the 
138         // original text
139         ArrayList result = new ArrayList(outputSentences);
140 
141         Collections.sort(result, new Comparator() {
142             public int compare(Object arg0, Object arg1) {
143                 String sentence1 = (String) arg0;
144                 String sentence2 = (String) arg1;
145 
146                 int indexOfSentence1 = input.indexOf(sentence1.trim());
147                 int indexOfSentence2 = input.indexOf(sentence2.trim());
148                 int result = indexOfSentence1 - indexOfSentence2;
149 
150                 return result;
151             }
152 
153         });
154         return result;
155     }
156 
157 }