| %line | %branch | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| net.sf.classifier4J.summariser.SimpleSummariser$1 |
|
|
| 1 | /* |
|
| 2 | * ==================================================================== |
|
| 3 | * |
|
| 4 | * The Apache Software License, Version 1.1 |
|
| 5 | * |
|
| 6 | * Copyright (c) 2003-2005 Nick Lothian. All rights reserved. |
|
| 7 | * |
|
| 8 | * Redistribution and use in source and binary forms, with or without |
|
| 9 | * modification, are permitted provided that the following conditions |
|
| 10 | * are met: |
|
| 11 | * |
|
| 12 | * 1. Redistributions of source code must retain the above copyright |
|
| 13 | * notice, this list of conditions and the following disclaimer. |
|
| 14 | * |
|
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
| 16 | * notice, this list of conditions and the following disclaimer in |
|
| 17 | * the documentation and/or other materials provided with the |
|
| 18 | * distribution. |
|
| 19 | * |
|
| 20 | * 3. The end-user documentation included with the redistribution, if |
|
| 21 | * any, must include the following acknowlegement: |
|
| 22 | * "This product includes software developed by the |
|
| 23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
| 24 | * Alternately, this acknowlegement may appear in the software itself, |
|
| 25 | * if and wherever such third-party acknowlegements normally appear. |
|
| 26 | * |
|
| 27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
| 28 | * products derived from this software without prior written |
|
| 29 | * permission. For written permission, please contact |
|
| 30 | * http://sourceforge.net/users/nicklothian/. |
|
| 31 | * |
|
| 32 | * 5. Products derived from this software may not be called |
|
| 33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
| 34 | * without prior written permission. For written permission, please |
|
| 35 | * contact http://sourceforge.net/users/nicklothian/. |
|
| 36 | * |
|
| 37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
| 38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
| 39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
| 40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
| 41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
| 42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
| 43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
| 44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
| 45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
| 46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
| 47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
| 48 | * SUCH DAMAGE. |
|
| 49 | * ==================================================================== |
|
| 50 | */ |
|
| 51 | ||
| 52 | package net.sf.classifier4J.summariser; |
|
| 53 | ||
| 54 | import java.util.ArrayList; |
|
| 55 | import java.util.Collections; |
|
| 56 | import java.util.Comparator; |
|
| 57 | import java.util.Iterator; |
|
| 58 | import java.util.LinkedHashSet; |
|
| 59 | import java.util.List; |
|
| 60 | import java.util.Map; |
|
| 61 | import java.util.Set; |
|
| 62 | ||
| 63 | import net.sf.classifier4J.Utilities; |
|
| 64 | ||
| 65 | public class SimpleSummariser implements ISummariser { |
|
| 66 | ||
| 67 | private Integer findMaxValue(List input) { |
|
| 68 | Collections.sort(input); |
|
| 69 | return (Integer) input.get(0); |
|
| 70 | } |
|
| 71 | ||
| 72 | ||
| 73 | protected Set getMostFrequentWords(int count, Map wordFrequencies) { |
|
| 74 | return Utilities.getMostFrequentWords(count, wordFrequencies); |
|
| 75 | } |
|
| 76 | ||
| 77 | /** |
|
| 78 | * @see net.sf.classifier4J.summariser.ISummariser#summarise(java.lang.String) |
|
| 79 | */ |
|
| 80 | public String summarise(String input, int numSentences) { |
|
| 81 | // get the frequency of each word in the input |
|
| 82 | Map wordFrequencies = Utilities.getWordFrequency(input); |
|
| 83 | ||
| 84 | // now create a set of the X most frequent words |
|
| 85 | Set mostFrequentWords = getMostFrequentWords(100, wordFrequencies); |
|
| 86 | ||
| 87 | // break the input up into sentences |
|
| 88 | // workingSentences is used for the analysis, but |
|
| 89 | // actualSentences is used in the results so that the |
|
| 90 | // capitalisation will be correct. |
|
| 91 | String[] workingSentences = Utilities.getSentences(input.toLowerCase()); |
|
| 92 | String[] actualSentences = Utilities.getSentences(input); |
|
| 93 | ||
| 94 | // iterate over the most frequent words, and add the first sentence |
|
| 95 | // that includes each word to the result |
|
| 96 | Set outputSentences = new LinkedHashSet(); |
|
| 97 | Iterator it = mostFrequentWords.iterator(); |
|
| 98 | while (it.hasNext()) { |
|
| 99 | String word = (String) it.next(); |
|
| 100 | for (int i = 0; i < workingSentences.length; i++) { |
|
| 101 | if (workingSentences[i].indexOf(word) >= 0) { |
|
| 102 | outputSentences.add(actualSentences[i]); |
|
| 103 | break; |
|
| 104 | } |
|
| 105 | if (outputSentences.size() >= numSentences) { |
|
| 106 | break; |
|
| 107 | } |
|
| 108 | } |
|
| 109 | if (outputSentences.size() >= numSentences) { |
|
| 110 | break; |
|
| 111 | } |
|
| 112 | ||
| 113 | } |
|
| 114 | ||
| 115 | List reorderedOutputSentences = reorderSentences(outputSentences, input); |
|
| 116 | ||
| 117 | StringBuffer result = new StringBuffer(""); |
|
| 118 | it = reorderedOutputSentences.iterator(); |
|
| 119 | while (it.hasNext()) { |
|
| 120 | String sentence = (String) it.next(); |
|
| 121 | result.append(sentence); |
|
| 122 | result.append("."); // This isn't always correct - perhaps it should be whatever symbol the sentence finished with |
|
| 123 | if (it.hasNext()) { |
|
| 124 | result.append(" "); |
|
| 125 | } |
|
| 126 | } |
|
| 127 | ||
| 128 | return result.toString(); |
|
| 129 | } |
|
| 130 | ||
| 131 | /** |
|
| 132 | * @param outputSentences |
|
| 133 | * @param input |
|
| 134 | * @return |
|
| 135 | */ |
|
| 136 | private List reorderSentences(Set outputSentences, final String input) { |
|
| 137 | // reorder the sentences to the order they were in the |
|
| 138 | // original text |
|
| 139 | ArrayList result = new ArrayList(outputSentences); |
|
| 140 | ||
| 141 | Collections.sort(result, new Comparator() { |
|
| 142 | 4 | public int compare(Object arg0, Object arg1) { |
| 143 | 2 | String sentence1 = (String) arg0; |
| 144 | 2 | String sentence2 = (String) arg1; |
| 145 | ||
| 146 | 2 | int indexOfSentence1 = input.indexOf(sentence1.trim()); |
| 147 | 2 | int indexOfSentence2 = input.indexOf(sentence2.trim()); |
| 148 | 2 | int result = indexOfSentence1 - indexOfSentence2; |
| 149 | ||
| 150 | 2 | return result; |
| 151 | } |
|
| 152 | ||
| 153 | }); |
|
| 154 | return result; |
|
| 155 | } |
|
| 156 | ||
| 157 | } |
| This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |