1 /* 2 * ==================================================================== 3 * 4 * The Apache Software License, Version 1.1 5 * 6 * Copyright (c) 2003 Nick Lothian. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * 20 * 3. The end-user documentation included with the redistribution, if 21 * any, must include the following acknowlegement: 22 * "This product includes software developed by the 23 * developers of Classifier4J (http://classifier4j.sf.net/)." 24 * Alternately, this acknowlegement may appear in the software itself, 25 * if and wherever such third-party acknowlegements normally appear. 26 * 27 * 4. The name "Classifier4J" must not be used to endorse or promote 28 * products derived from this software without prior written 29 * permission. For written permission, please contact 30 * http://sourceforge.net/users/nicklothian/. 31 * 32 * 5. Products derived from this software may not be called 33 * "Classifier4J", nor may "Classifier4J" appear in their names 34 * without prior written permission. For written permission, please 35 * contact http://sourceforge.net/users/nicklothian/. 36 * 37 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 38 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 39 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 40 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 44 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 45 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 46 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 47 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * ==================================================================== 50 */ 51 52 package net.sf.classifier4J; 53 54 import java.util.Arrays; 55 56 import net.sf.classifier4J.util.ToStringBuilder; 57 58 59 /*** 60 * @author Nick Lothian 61 * @author Peter Leschev 62 */ 63 public class DefaultStopWordsProvider implements IStopWordProvider { 64 // This array is sorted in the constructor 65 private String[] stopWords = { "a", "and", "the", "me", "i", "of", "if", "it", "is", "they", "there", "but", "or", "to", "this", "you", "in", "your", "on", "for", "as", "are", "that", "with", "have", "be", "at", "or", "was", "so", "out", "not", "an" }; 66 private String[] sortedStopWords = null; 67 68 public DefaultStopWordsProvider() { 69 sortedStopWords = getStopWords(); 70 Arrays.sort(sortedStopWords); 71 } 72 73 /*** 74 * getter method which can be overridden to 75 * supply the stop words. The array returned by this 76 * method is sorted and then used internally 77 * 78 * @return the array of stop words 79 */ 80 public String[] getStopWords() { 81 return stopWords; 82 } 83 84 /*** 85 * @see net.sf.classifier4J.IStopWordProvider#isStopWord(java.lang.String) 86 */ 87 public boolean isStopWord(String word) { 88 if (word == null || "".equals(word)) { 89 return false; 90 } else { 91 // search the sorted array for the word, converted to lowercase 92 // if it is found, the index will be >= 0 93 return (Arrays.binarySearch(sortedStopWords, word.toLowerCase()) >= 0); 94 } 95 } 96 97 public String toString() { 98 return new ToStringBuilder(this).append("stopWords.size()", sortedStopWords.length).toString(); 99 } 100 }