1 /* 2 * ==================================================================== 3 * 4 * The Apache Software License, Version 1.1 5 * 6 * Copyright (c) 2003-2005 Nick Lothian. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * 20 * 3. The end-user documentation included with the redistribution, if 21 * any, must include the following acknowlegement: 22 * "This product includes software developed by the 23 * developers of Classifier4J (http://classifier4j.sf.net/)." 24 * Alternately, this acknowlegement may appear in the software itself, 25 * if and wherever such third-party acknowlegements normally appear. 26 * 27 * 4. The name "Classifier4J" must not be used to endorse or promote 28 * products derived from this software without prior written 29 * permission. For written permission, please contact 30 * http://sourceforge.net/users/nicklothian/. 31 * 32 * 5. Products derived from this software may not be called 33 * "Classifier4J", nor may "Classifier4J" appear in their names 34 * without prior written permission. For written permission, please 35 * contact http://sourceforge.net/users/nicklothian/. 36 * 37 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 38 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 39 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 40 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 44 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 45 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 46 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 47 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * ==================================================================== 50 */ 51 52 package net.sf.classifier4J.summariser; 53 54 import java.util.ArrayList; 55 import java.util.Collections; 56 import java.util.Comparator; 57 import java.util.Iterator; 58 import java.util.LinkedHashSet; 59 import java.util.List; 60 import java.util.Map; 61 import java.util.Set; 62 63 import net.sf.classifier4J.Utilities; 64 65 public class SimpleSummariser implements ISummariser { 66 67 private Integer findMaxValue(List input) { 68 Collections.sort(input); 69 return (Integer) input.get(0); 70 } 71 72 73 protected Set getMostFrequentWords(int count, Map wordFrequencies) { 74 return Utilities.getMostFrequentWords(count, wordFrequencies); 75 } 76 77 /*** 78 * @see net.sf.classifier4J.summariser.ISummariser#summarise(java.lang.String) 79 */ 80 public String summarise(String input, int numSentences) { 81 // get the frequency of each word in the input 82 Map wordFrequencies = Utilities.getWordFrequency(input); 83 84 // now create a set of the X most frequent words 85 Set mostFrequentWords = getMostFrequentWords(100, wordFrequencies); 86 87 // break the input up into sentences 88 // workingSentences is used for the analysis, but 89 // actualSentences is used in the results so that the 90 // capitalisation will be correct. 91 String[] workingSentences = Utilities.getSentences(input.toLowerCase()); 92 String[] actualSentences = Utilities.getSentences(input); 93 94 // iterate over the most frequent words, and add the first sentence 95 // that includes each word to the result 96 Set outputSentences = new LinkedHashSet(); 97 Iterator it = mostFrequentWords.iterator(); 98 while (it.hasNext()) { 99 String word = (String) it.next(); 100 for (int i = 0; i < workingSentences.length; i++) { 101 if (workingSentences[i].indexOf(word) >= 0) { 102 outputSentences.add(actualSentences[i]); 103 break; 104 } 105 if (outputSentences.size() >= numSentences) { 106 break; 107 } 108 } 109 if (outputSentences.size() >= numSentences) { 110 break; 111 } 112 113 } 114 115 List reorderedOutputSentences = reorderSentences(outputSentences, input); 116 117 StringBuffer result = new StringBuffer(""); 118 it = reorderedOutputSentences.iterator(); 119 while (it.hasNext()) { 120 String sentence = (String) it.next(); 121 result.append(sentence); 122 result.append("."); // This isn't always correct - perhaps it should be whatever symbol the sentence finished with 123 if (it.hasNext()) { 124 result.append(" "); 125 } 126 } 127 128 return result.toString(); 129 } 130 131 /*** 132 * @param outputSentences 133 * @param input 134 * @return 135 */ 136 private List reorderSentences(Set outputSentences, final String input) { 137 // reorder the sentences to the order they were in the 138 // original text 139 ArrayList result = new ArrayList(outputSentences); 140 141 Collections.sort(result, new Comparator() { 142 public int compare(Object arg0, Object arg1) { 143 String sentence1 = (String) arg0; 144 String sentence2 = (String) arg1; 145 146 int indexOfSentence1 = input.indexOf(sentence1.trim()); 147 int indexOfSentence2 = input.indexOf(sentence2.trim()); 148 int result = indexOfSentence1 - indexOfSentence2; 149 150 return result; 151 } 152 153 }); 154 return result; 155 } 156 157 }