View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J;
53  
54  import java.io.BufferedReader;
55  import java.io.IOException;
56  import java.io.InputStreamReader;
57  import java.util.ArrayList;
58  import java.util.Arrays;
59  
60  import net.sf.classifier4J.util.*;
61  
62  public class CustomizableStopWordProvider implements IStopWordProvider {
63  
64      private Resource resource;
65      private String[] words;
66  
67      public static final String DEFAULT_STOPWORD_PROVIDER_RESOURCENAME = "defaultStopWords.txt";
68  
69      /***
70       * 
71       * @param filename Identifies the name of a textfile on the classpath that contains
72       * a list of stop words, one on each line
73       */
74      public CustomizableStopWordProvider(String resourcename) throws IOException {
75          resource = new Resource(resourcename);
76          
77          init();
78      }
79  
80      public CustomizableStopWordProvider() throws IOException {
81          this(DEFAULT_STOPWORD_PROVIDER_RESOURCENAME);
82      }
83  
84      protected void init() throws IOException {
85          ArrayList wordsLst = new ArrayList();
86          BufferedReader reader = new BufferedReader(new InputStreamReader(resource.getInputStream()));
87          
88          String word;
89          while ((word = reader.readLine()) != null) {
90              wordsLst.add(word.trim());
91          }
92          
93          words = (String[]) wordsLst.toArray(new String[wordsLst.size()]);
94          
95          Arrays.sort(words);
96      }
97  
98      /***
99       * @see net.sf.classifier4J.IStopWordProvider#isStopWord(java.lang.String)
100      */
101     public boolean isStopWord(String word) {
102         return (Arrays.binarySearch(words, word) >= 0);
103     }
104 
105 }