1 /*
2 * ====================================================================
3 *
4 * The Apache Software License, Version 1.1
5 *
6 * Copyright (c) 2003 Nick Lothian. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution, if
21 * any, must include the following acknowlegement:
22 * "This product includes software developed by the
23 * developers of Classifier4J (http://classifier4j.sf.net/)."
24 * Alternately, this acknowlegement may appear in the software itself,
25 * if and wherever such third-party acknowlegements normally appear.
26 *
27 * 4. The name "Classifier4J" must not be used to endorse or promote
28 * products derived from this software without prior written
29 * permission. For written permission, please contact
30 * http://sourceforge.net/users/nicklothian/.
31 *
32 * 5. Products derived from this software may not be called
33 * "Classifier4J", nor may "Classifier4J" appear in their names
34 * without prior written permission. For written permission, please
35 * contact http://sourceforge.net/users/nicklothian/.
36 *
37 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 * ====================================================================
50 */
51 package net.sf.classifier4J.demo;
52
53 import java.io.File;
54 import java.io.FileInputStream;
55 import java.io.IOException;
56 import java.io.InputStream;
57 import java.sql.SQLException;
58
59 import net.sf.classifier4J.ClassifierException;
60 import net.sf.classifier4J.DefaultTokenizer;
61 import net.sf.classifier4J.ITokenizer;
62 import net.sf.classifier4J.ITrainableClassifier;
63 import net.sf.classifier4J.Utilities;
64 import net.sf.classifier4J.bayesian.BayesianClassifier;
65 import net.sf.classifier4J.bayesian.JDBMWordsDataSource;
66
67 /***
68 * @author Nick Lothian
69 * @author Peter Leschev
70 */
71 public class Trainer {
72
73 /***
74 * Given an inputStream of data, a tokenizer this method trains the
75 * specified classifier.
76 *
77 * @returns Words Per Second
78 */
79 public static double trainClassifier(ITokenizer tokenizer,
80 ITrainableClassifier classifier,
81 boolean isMatch,
82 InputStream inputStream) throws IOException, ClassifierException {
83
84 // System.out.println("Training Classifier4J using " + classifier + " and " +
85 // tokenizer);
86
87 String contents = Utilities.getString(inputStream);
88 int length = tokenizer.tokenize(contents).length;
89
90 long startTime = System.currentTimeMillis();
91
92 if (isMatch) {
93 // System.out.println(length +
94 // " matching words. This may take a while.");
95 classifier.teachMatch(contents);
96 } else {
97 // System.out.println(length +
98 // " non-matching words. This may take a while.");
99 classifier.teachNonMatch(contents);
100 }
101
102 long endTime = System.currentTimeMillis();
103
104 double time = (double)(endTime - startTime) / (double)1000;
105
106 if (Double.compare(time, 0) == 0) {
107 time = 1;
108 }
109
110 double wordsPerSecond = length / time;
111
112 // System.out.println("Done. Took " + time + " seconds, which is " +
113 // wordsPerSecond + " words per second.");
114
115 return wordsPerSecond;
116 }
117
118 public static String connectionString = "jdbc:hsqldb:./database/";
119 public static String username = "sa";
120 public static String password = "";
121
122 static JDBMWordsDataSource wds;
123
124 private static ITrainableClassifier setupClassifier(String connString, String user, String pw) throws SQLException, IOException {
125 /*
126 DriverMangerJDBCConnectionManager cm = new DriverMangerJDBCConnectionManager(connString, user, pw);
127 JDBCWordsDataSource wds = new JDBCWordsDataSource(cm);
128 wds.createTable();
129 */
130 wds = new JDBMWordsDataSource("./database/");
131 wds.open();
132 return new BayesianClassifier(wds);
133 }
134
135 public static void main(String[] args) throws Exception {
136 System.out.println("This program reads in two files, one of which is considered to define a match.");
137 System.out.println("These two files are analysed by Classifier4J and the resulting word probabilities are loaded into a JDBM database.");
138 System.out.println("");
139 System.out.println("To reset the word probabilities, delete the \"database\" directory which is created.");
140
141 File dir = new File("./database");
142 dir.mkdir();
143
144 ITrainableClassifier classifier = setupClassifier(connectionString, username, password);
145 ITokenizer tokenizer = new DefaultTokenizer();
146
147 trainClassifier(tokenizer,
148 classifier,
149 true,
150 new FileInputStream("./demodata/match.txt"));
151
152 trainClassifier(tokenizer,
153 classifier,
154 false,
155 new FileInputStream("./demodata/nonmatch.txt"));
156
157 wds.close();
158 }
159 /*
160 static {
161 try {
162 Class.forName("org.hsqldb.jdbcDriver");
163 } catch (ClassNotFoundException e) {
164 e.printStackTrace();
165 }
166 }
167 */
168 }