View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J.bayesian;
53  
54  import java.io.IOException;
55  
56  import jdbm.btree.BTree;
57  import jdbm.helper.MRU;
58  import jdbm.helper.ObjectCache;
59  import jdbm.helper.StringComparator;
60  import jdbm.recman.RecordManager;
61  import net.sf.classifier4J.ICategorisedClassifier;
62  
63  import org.apache.commons.logging.Log;
64  import org.apache.commons.logging.LogFactory;
65  
66  public class JDBMWordsDataSource implements ICategorisedWordsDataSource {
67  	Log log = LogFactory.getLog(this.getClass());
68  
69  	RecordManager recordManager = null;
70  	BTree tree = null;
71  
72  	String dir = ".";
73  	static String databaseName = "wordprobs";
74  	static String tableName = "wordprobabilities";
75  
76  	public JDBMWordsDataSource() {
77  	}
78  
79  	public JDBMWordsDataSource(String directory) {
80  		this.dir = directory;
81  	}
82  
83  	public void close() {
84  		if (recordManager != null) {
85  			try {
86  				recordManager.commit();
87  			} catch (IOException e) {
88  				// do nothing				
89  			}
90  			try {
91  				recordManager.close();
92  			} catch (IOException e1) {
93  				// do nothing				
94  			}
95  		}
96  	}
97  
98  	public void open() throws IOException {
99  		recordManager = new RecordManager(dir + "/" + databaseName);
100 		ObjectCache cache = new ObjectCache(recordManager, new MRU(100));
101 
102 		long recid = recordManager.getNamedObject(tableName);
103 		if (recid != 0) {
104 			// already exists
105 			tree = BTree.load(recordManager, cache, recid);
106 		} else {
107 			// does not exist
108 			tree = new BTree(recordManager, cache, new StringComparator());
109 			recordManager.setNamedObject(tableName, tree.getRecid());
110 		}
111 	}
112 
113 	/***
114 	 * @see net.sf.classifier4J.bayesian.IWordsDataSource#addMatch(java.lang.String)
115 	 */
116 	public void addMatch(String word) {
117 		addMatch(ICategorisedClassifier.DEFAULT_CATEGORY, word);
118 	}
119 
120 	/***
121 	 * @see net.sf.classifier4J.bayesian.IWordsDataSource#addNonMatch(java.lang.String)
122 	 */
123 	public void addNonMatch(String word) {
124 		addNonMatch(ICategorisedClassifier.DEFAULT_CATEGORY, word);
125 	}
126 
127 	/***
128 	 * @see net.sf.classifier4J.bayesian.ICategorisedWordsDataSource#addMatch(java.lang.String, java.lang.String)
129 	 */
130 	public void addMatch(String category, String word) {
131 		try {
132 			WordProbability wp = getWordProbability(category, word);
133 			if (wp == null) {
134 				wp = new WordProbability(word, 1, 0);
135 			} else {
136 				wp.setMatchingCount(wp.getMatchingCount() + 1);
137 			}
138 			tree.insert(getKey(category, word), wp, true);
139 		} catch (IOException e) {
140 			log.error("Error with JDBM datasource", e);
141 			throw new RuntimeException("Error with JDBM datasource");
142 		}
143 
144 	}
145 
146 	/***
147 	 * @see net.sf.classifier4J.bayesian.ICategorisedWordsDataSource#addNonMatch(java.lang.String, java.lang.String)
148 	 */
149 	public void addNonMatch(String category, String word) {
150 		try {
151 			WordProbability wp = getWordProbability(category, word);
152 			if (wp == null) {
153 				wp = new WordProbability(word, 0, 1);
154 			} else {
155 				wp.setNonMatchingCount(wp.getNonMatchingCount() + 1);
156 			}
157 			tree.insert(getKey(category, word), wp, true);
158 		} catch (IOException e) {
159 			log.error("Error with JDBM datasource", e);
160 			throw new RuntimeException("Error with JDBM datasource");
161 		}
162 	}
163 
164 	/***
165 	 * @see net.sf.classifier4J.bayesian.IWordsDataSource#getWordProbability(java.lang.String)
166 	 */
167 	public WordProbability getWordProbability(String word) {
168 		return getWordProbability(ICategorisedClassifier.DEFAULT_CATEGORY, word);
169 	}
170 
171 	/***
172 	 * @see net.sf.classifier4J.bayesian.ICategorisedWordsDataSource#getWordProbability(java.lang.String, java.lang.String)
173 	 */
174 	public WordProbability getWordProbability(String category, String word) {
175 		try {
176 			return (WordProbability) tree.find(getKey(category,word));
177 		} catch (IOException e) {
178 			log.error("Error in JDBM datasource", e);
179 			throw new RuntimeException("Error in JDBM datasource");
180 		}
181 	}
182 
183 	/***
184 	 * 
185 	 * @param category The category, or null for the default
186 	 * @param word The word, cannot be null
187 	 * @return the key for the category and word. By default this is "category : word"
188 	 * @throws IllegalArgumentException if word is null
189 	 */
190 	protected String getKey(String category, String word) throws IllegalArgumentException {
191 		if (word == null) {
192 			throw new IllegalArgumentException("Word cannot be null");
193 		}
194 		StringBuffer result = new StringBuffer("");
195 		if (category == null) {
196 			result.append(ICategorisedClassifier.DEFAULT_CATEGORY);
197 		} else {
198 			result.append(category);
199 		}
200 		result.append(" : "); // space:space
201 		result.append(word);
202 
203 		return result.toString();
204 	}
205 
206     protected void finalize() throws Throwable {
207         close();
208     }
209 
210 
211 }