%line | %branch | |||||||||
---|---|---|---|---|---|---|---|---|---|---|
net.sf.classifier4J.demo.Trainer |
|
|
1 | /* |
|
2 | * ==================================================================== |
|
3 | * |
|
4 | * The Apache Software License, Version 1.1 |
|
5 | * |
|
6 | * Copyright (c) 2003 Nick Lothian. All rights reserved. |
|
7 | * |
|
8 | * Redistribution and use in source and binary forms, with or without |
|
9 | * modification, are permitted provided that the following conditions |
|
10 | * are met: |
|
11 | * |
|
12 | * 1. Redistributions of source code must retain the above copyright |
|
13 | * notice, this list of conditions and the following disclaimer. |
|
14 | * |
|
15 | * 2. Redistributions in binary form must reproduce the above copyright |
|
16 | * notice, this list of conditions and the following disclaimer in |
|
17 | * the documentation and/or other materials provided with the |
|
18 | * distribution. |
|
19 | * |
|
20 | * 3. The end-user documentation included with the redistribution, if |
|
21 | * any, must include the following acknowlegement: |
|
22 | * "This product includes software developed by the |
|
23 | * developers of Classifier4J (http://classifier4j.sf.net/)." |
|
24 | * Alternately, this acknowlegement may appear in the software itself, |
|
25 | * if and wherever such third-party acknowlegements normally appear. |
|
26 | * |
|
27 | * 4. The name "Classifier4J" must not be used to endorse or promote |
|
28 | * products derived from this software without prior written |
|
29 | * permission. For written permission, please contact |
|
30 | * http://sourceforge.net/users/nicklothian/. |
|
31 | * |
|
32 | * 5. Products derived from this software may not be called |
|
33 | * "Classifier4J", nor may "Classifier4J" appear in their names |
|
34 | * without prior written permission. For written permission, please |
|
35 | * contact http://sourceforge.net/users/nicklothian/. |
|
36 | * |
|
37 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
|
38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
|
39 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
40 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
|
41 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
43 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
|
44 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
45 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
|
46 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
|
47 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
48 | * SUCH DAMAGE. |
|
49 | * ==================================================================== |
|
50 | */ |
|
51 | package net.sf.classifier4J.demo; |
|
52 | ||
53 | import java.io.File; |
|
54 | import java.io.FileInputStream; |
|
55 | import java.io.IOException; |
|
56 | import java.io.InputStream; |
|
57 | import java.sql.SQLException; |
|
58 | ||
59 | import net.sf.classifier4J.ClassifierException; |
|
60 | import net.sf.classifier4J.DefaultTokenizer; |
|
61 | import net.sf.classifier4J.ITokenizer; |
|
62 | import net.sf.classifier4J.ITrainableClassifier; |
|
63 | import net.sf.classifier4J.Utilities; |
|
64 | import net.sf.classifier4J.bayesian.BayesianClassifier; |
|
65 | import net.sf.classifier4J.bayesian.JDBMWordsDataSource; |
|
66 | ||
67 | /** |
|
68 | * @author Nick Lothian |
|
69 | * @author Peter Leschev |
|
70 | */ |
|
71 | 0 | public class Trainer { |
72 | ||
73 | /** |
|
74 | * Given an input stream of data and a tokenizer, this method trains the |
|
75 | * specified classifier. |
|
76 | * |
|
77 | * @return Words Per Second |
|
78 | */ |
|
79 | public static double trainClassifier(ITokenizer tokenizer, |
|
80 | ITrainableClassifier classifier, |
|
81 | boolean isMatch, |
|
82 | InputStream inputStream) throws IOException, ClassifierException { |
|
83 | ||
84 | // System.out.println("Training Classifier4J using " + classifier + " and " + |
|
85 | // tokenizer); |
|
86 | ||
87 | 0 | String contents = Utilities.getString(inputStream); |
88 | 0 | int length = tokenizer.tokenize(contents).length; |
89 | ||
90 | 0 | long startTime = System.currentTimeMillis(); |
91 | ||
92 | 0 | if (isMatch) { |
93 | // System.out.println(length + |
|
94 | // " matching words. This may take a while."); |
|
95 | 0 | classifier.teachMatch(contents); |
96 | } else { |
|
97 | // System.out.println(length + |
|
98 | // " non-matching words. This may take a while."); |
|
99 | 0 | classifier.teachNonMatch(contents); |
100 | } |
|
101 | ||
102 | 0 | long endTime = System.currentTimeMillis(); |
103 | ||
104 | 0 | double time = (double)(endTime - startTime) / (double)1000; |
105 | ||
106 | 0 | if (Double.compare(time, 0) == 0) { |
107 | 0 | time = 1; |
108 | } |
|
109 | ||
110 | 0 | double wordsPerSecond = length / time; |
111 | ||
112 | // System.out.println("Done. Took " + time + " seconds, which is " + |
|
113 | // wordsPerSecond + " words per second."); |
|
114 | ||
115 | 0 | return wordsPerSecond; |
116 | } |
|
117 | ||
118 | 0 | public static String connectionString = "jdbc:hsqldb:./database/"; |
119 | 0 | public static String username = "sa"; |
120 | 0 | public static String password = ""; |
121 | ||
122 | static JDBMWordsDataSource wds; |
|
123 | ||
124 | private static ITrainableClassifier setupClassifier(String connString, String user, String pw) throws SQLException, IOException { |
|
125 | /* |
|
126 | DriverMangerJDBCConnectionManager cm = new DriverMangerJDBCConnectionManager(connString, user, pw); |
|
127 | JDBCWordsDataSource wds = new JDBCWordsDataSource(cm); |
|
128 | wds.createTable(); |
|
129 | */ |
|
130 | 0 | wds = new JDBMWordsDataSource("./database/"); |
131 | 0 | wds.open(); |
132 | 0 | return new BayesianClassifier(wds); |
133 | } |
|
134 | ||
135 | public static void main(String[] args) throws Exception { |
|
136 | 0 | System.out.println("This program reads in two files, one of which is considered to define a match."); |
137 | 0 | System.out.println("These two files are analysed by Classifier4J and the resulting word probabilities are loaded into a JDBM database."); |
138 | 0 | System.out.println(""); |
139 | 0 | System.out.println("To reset the word probabilities, delete the \"database\" directory which is created."); |
140 | ||
141 | 0 | File dir = new File("./database"); |
142 | 0 | dir.mkdir(); |
143 | ||
144 | 0 | ITrainableClassifier classifier = setupClassifier(connectionString, username, password); |
145 | 0 | ITokenizer tokenizer = new DefaultTokenizer(); |
146 | ||
147 | 0 | trainClassifier(tokenizer, |
148 | classifier, |
|
149 | true, |
|
150 | new FileInputStream("./demodata/match.txt")); |
|
151 | ||
152 | 0 | trainClassifier(tokenizer, |
153 | classifier, |
|
154 | false, |
|
155 | new FileInputStream("./demodata/nonmatch.txt")); |
|
156 | ||
157 | 0 | wds.close(); |
158 | 0 | } |
159 | /* |
|
160 | static { |
|
161 | try { |
|
162 | Class.forName("org.hsqldb.jdbcDriver"); |
|
163 | } catch (ClassNotFoundException e) { |
|
164 | e.printStackTrace(); |
|
165 | } |
|
166 | } |
|
167 | */ |
|
168 | } |
This report is generated by jcoverage, Maven and Maven JCoverage Plugin. |