1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51 package net.sf.classifier4J.demo;
52
53 import java.io.File;
54 import java.io.FileInputStream;
55 import java.io.IOException;
56 import java.io.InputStream;
57 import java.sql.SQLException;
58
59 import net.sf.classifier4J.ClassifierException;
60 import net.sf.classifier4J.DefaultTokenizer;
61 import net.sf.classifier4J.ITokenizer;
62 import net.sf.classifier4J.ITrainableClassifier;
63 import net.sf.classifier4J.Utilities;
64 import net.sf.classifier4J.bayesian.BayesianClassifier;
65 import net.sf.classifier4J.bayesian.JDBMWordsDataSource;
66
67 /***
68 * @author Nick Lothian
69 * @author Peter Leschev
70 */
71 public class Trainer {
72
73 /***
74 * Given an inputStream of data, a tokenizer this method trains the
75 * specified classifier.
76 *
77 * @returns Words Per Second
78 */
79 public static double trainClassifier(ITokenizer tokenizer,
80 ITrainableClassifier classifier,
81 boolean isMatch,
82 InputStream inputStream) throws IOException, ClassifierException {
83
84
85
86
87 String contents = Utilities.getString(inputStream);
88 int length = tokenizer.tokenize(contents).length;
89
90 long startTime = System.currentTimeMillis();
91
92 if (isMatch) {
93
94
95 classifier.teachMatch(contents);
96 } else {
97
98
99 classifier.teachNonMatch(contents);
100 }
101
102 long endTime = System.currentTimeMillis();
103
104 double time = (double)(endTime - startTime) / (double)1000;
105
106 if (Double.compare(time, 0) == 0) {
107 time = 1;
108 }
109
110 double wordsPerSecond = length / time;
111
112
113
114
115 return wordsPerSecond;
116 }
117
118 public static String connectionString = "jdbc:hsqldb:./database/";
119 public static String username = "sa";
120 public static String password = "";
121
122 static JDBMWordsDataSource wds;
123
124 private static ITrainableClassifier setupClassifier(String connString, String user, String pw) throws SQLException, IOException {
125
126
127
128
129
130 wds = new JDBMWordsDataSource("./database/");
131 wds.open();
132 return new BayesianClassifier(wds);
133 }
134
135 public static void main(String[] args) throws Exception {
136 System.out.println("This program reads in two files, one of which is considered to define a match.");
137 System.out.println("These two files are analysed by Classifier4J and the resulting word probabilities are loaded into a JDBM database.");
138 System.out.println("");
139 System.out.println("To reset the word probabilities, delete the \"database\" directory which is created.");
140
141 File dir = new File("./database");
142 dir.mkdir();
143
144 ITrainableClassifier classifier = setupClassifier(connectionString, username, password);
145 ITokenizer tokenizer = new DefaultTokenizer();
146
147 trainClassifier(tokenizer,
148 classifier,
149 true,
150 new FileInputStream("./demodata/match.txt"));
151
152 trainClassifier(tokenizer,
153 classifier,
154 false,
155 new FileInputStream("./demodata/nonmatch.txt"));
156
157 wds.close();
158 }
159
160
161
162
163
164
165
166
167
168 }