View Javadoc

1   /*
2    * ====================================================================
3    * 
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2003 Nick Lothian. All rights reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution, if
21   *    any, must include the following acknowlegement:  
22   *       "This product includes software developed by the 
23   *        developers of Classifier4J (http://classifier4j.sf.net/)."
24   *    Alternately, this acknowlegement may appear in the software itself,
25   *    if and wherever such third-party acknowlegements normally appear.
26   *
27   * 4. The name "Classifier4J" must not be used to endorse or promote 
28   *    products derived from this software without prior written 
29   *    permission. For written permission, please contact   
30   *    http://sourceforge.net/users/nicklothian/.
31   *
32   * 5. Products derived from this software may not be called 
33   *    "Classifier4J", nor may "Classifier4J" appear in their names 
34   *    without prior written permission. For written permission, please 
35   *    contact http://sourceforge.net/users/nicklothian/.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   */
51  
52  package net.sf.classifier4J.bayesian;
53  
54  import java.sql.Connection;
55  import java.sql.DatabaseMetaData;
56  import java.sql.PreparedStatement;
57  import java.sql.ResultSet;
58  import java.sql.SQLException;
59  import java.sql.Statement;
60  
61  import org.apache.commons.logging.Log;
62  import org.apache.commons.logging.LogFactory;
63  
64  import net.sf.classifier4J.ICategorisedClassifier;
65  
66  /***
67   * 
68   * <p>A JDBC based datasource. It requires a table of the following structure (tested in MySQL 4):
69   * 
70   * <pre>
71   * CREATE TABLE word_probability (
72   *	word			VARCHAR(255) NOT NULL,
73   *	category		VARCHAR(20) NOT NULL,
74   *	match_count		INT DEFAULT 0 NOT NULL,
75   *	nonmatch_count	INT DEFAULT 0 NOT NULL,
76   *	PRIMARY KEY(word, category)
77   * )
78   * </pre>
79   *
80   *</p>
81   *<p>It will truncate any word longer than 255 characters to 255 characters</p>
82   *
83   * @author Nick Lothian
84   * @author Peter Leschev
85   *  
86   */
87  public class JDBCWordsDataSource implements ICategorisedWordsDataSource {
88  
89      IJDBCConnectionManager connectionManager;
90  
91      private Log log = LogFactory.getLog(this.getClass());
92  
93      /***
94       * Create a JDBCWordsDataSource using the DEFAULT_CATEGORY ("DEFAULT")
95       * 
96       * @param cm The connection manager to use
97       */
98      public JDBCWordsDataSource(IJDBCConnectionManager cm) throws WordsDataSourceException {
99          this.connectionManager = cm;
100         createTable();
101     }
102 
103     public WordProbability getWordProbability(String category, String word) throws WordsDataSourceException {
104 
105         WordProbability wp = null;
106         String method = "getWordProbability()";
107 
108         int matchingCount = 0;
109         int nonMatchingCount = 0;
110 
111         Connection conn = null;
112         try {
113             conn = connectionManager.getConnection();
114             PreparedStatement ps = conn.prepareStatement("SELECT match_count, nonmatch_count FROM word_probability WHERE word = ? AND category = ?");
115             ps.setString(1, word);
116             ps.setString(2, category);
117 
118             ResultSet rs = ps.executeQuery();
119             if (rs.next()) {
120                 matchingCount = rs.getInt("match_count");
121                 nonMatchingCount = rs.getInt("nonmatch_count");
122             }
123             wp = new WordProbability(word, matchingCount, nonMatchingCount);
124 
125         } catch (SQLException e) {
126             throw new WordsDataSourceException("Problem obtaining WordProbability from database", e);
127         } finally {
128             if (conn != null) {
129                 try {
130                     connectionManager.returnConnection(conn);
131                 } catch (SQLException e1) {
132                     // ignore
133                 }
134             }
135         }
136 
137         if (log.isDebugEnabled()) {
138             log.debug(method + " WordProbability loaded [" + wp + "]");
139         }
140 
141         return wp;
142 
143     }
144 
145     public WordProbability getWordProbability(String word) throws WordsDataSourceException {
146         return getWordProbability(ICategorisedClassifier.DEFAULT_CATEGORY, word);
147     }
148 
149     private void updateWordProbability(String category, String word, boolean isMatch) throws WordsDataSourceException {
150         String fieldname = "nonmatch_count";
151         if (isMatch) {
152             fieldname = "match_count";
153         }
154 
155         // truncate word at 255 characters
156         if (word.length() > 255) {
157             word = word.substring(0, 254);
158         }
159 
160         Connection conn = null;
161         try {
162             conn = connectionManager.getConnection();
163             PreparedStatement insertStatement = conn.prepareStatement("INSERT INTO word_probability (word, category) VALUES (?, ?)");
164             PreparedStatement selectStatement = conn.prepareStatement("SELECT 1 FROM word_probability WHERE word = ? AND category = ?");
165             PreparedStatement updateStatement = conn.prepareStatement("UPDATE word_probability SET " + fieldname + " = " + fieldname + " + 1 WHERE word = ? AND category = ?");
166 
167             selectStatement.setString(1, word);
168             selectStatement.setString(2, category);
169             ResultSet rs = selectStatement.executeQuery();
170             if (!rs.next()) {
171                 // word is not in table
172                 // insert the word
173                 insertStatement.setString(1, word);
174                 insertStatement.setString(2, category);
175                 insertStatement.execute();
176             }
177             // update the word count
178             updateStatement.setString(1, word);
179             updateStatement.setString(2, category);
180             updateStatement.execute();
181 
182         } catch (SQLException e) {
183             throw new WordsDataSourceException("Problem updating WordProbability", e);
184         } finally {
185             if (conn != null) {
186                 try {
187                     connectionManager.returnConnection(conn);
188                 } catch (SQLException e1) {
189                     // ignore
190                 }
191             }
192         }
193 
194     }
195 
196     public void addMatch(String category, String word) throws WordsDataSourceException {
197         if (category == null) {
198             throw new IllegalArgumentException("category cannot be null");
199         }
200         updateWordProbability(category, word, true);
201     }
202 
203     public void addMatch(String word) throws WordsDataSourceException {
204         updateWordProbability(ICategorisedClassifier.DEFAULT_CATEGORY, word, true);
205     }
206 
207     public void addNonMatch(String category, String word) throws WordsDataSourceException {
208         if (category == null) {
209             throw new IllegalArgumentException("category cannot be null");
210         }
211         updateWordProbability(category, word, false);
212     }
213 
214     public void addNonMatch(String word) throws WordsDataSourceException {
215         updateWordProbability(ICategorisedClassifier.DEFAULT_CATEGORY, word, false);
216     }
217 
218     /***
219      * Create the word_probability table if it does not already
220      * exist. Tested successfully with MySQL 4 & HSQLDB. See
221      * comments in code for Axion 1.0M1 issues. 
222      *   
223      * 
224      * @throws WordsDataSourceException
225      */
226     private void createTable() throws WordsDataSourceException {
227         Connection con = null;
228         try {
229             con = connectionManager.getConnection();
230 
231             // check if the word_probability table exists 
232             DatabaseMetaData dbm = con.getMetaData();
233             ResultSet rs = dbm.getTables(null, null, "word_probability", null);
234             if (!rs.next()) {
235                 // the table does not exist
236                 Statement stmt = con.createStatement();
237                 //	Under Axion 1.0M1, use 			
238                 //	 stmt.executeUpdate( "CREATE TABLE word_probability ( "
239                 //			+ " word			VARCHAR(255) NOT NULL,"
240                 //			+ " category		VARCHAR(20) NOT NULL,"
241                 //			+ " match_count		INTEGER NOT NULL,"
242                 //			+ " nonmatch_count	INTEGER NOT NULL, "
243                 //			+ " PRIMARY KEY(word, category) ) ");				
244                 stmt.executeUpdate("CREATE TABLE word_probability ( " 
245                             + " word			VARCHAR(255) NOT NULL," 
246                             + " category		VARCHAR(20) NOT NULL," 
247                             + " match_count		INT DEFAULT 0 NOT NULL," 
248                             + " nonmatch_count	INT DEFAULT 0 NOT NULL, " 
249                             + " PRIMARY KEY(word, category) ) ");
250             }
251         } catch (SQLException e) {
252             throw new WordsDataSourceException("Problem creating table", e); // we can't recover from this				
253         } finally {
254             if (con != null) {
255                 try {
256                     connectionManager.returnConnection(con);
257                 } catch (SQLException e1) {
258                     // ignore
259                 }
260             }
261         }
262     }
263 }