1 /*
2 * ====================================================================
3 *
4 * The Apache Software License, Version 1.1
5 *
6 * Copyright (c) 2003 Nick Lothian. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution, if
21 * any, must include the following acknowlegement:
22 * "This product includes software developed by the
23 * developers of Classifier4J (http://classifier4j.sf.net/)."
24 * Alternately, this acknowlegement may appear in the software itself,
25 * if and wherever such third-party acknowlegements normally appear.
26 *
27 * 4. The name "Classifier4J" must not be used to endorse or promote
28 * products derived from this software without prior written
29 * permission. For written permission, please contact
30 * http://sourceforge.net/users/nicklothian/.
31 *
32 * 5. Products derived from this software may not be called
33 * "Classifier4J", nor may "Classifier4J" appear in their names
34 * without prior written permission. For written permission, please
35 * contact http://sourceforge.net/users/nicklothian/.
36 *
37 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 * ====================================================================
50 */
51 package net.sf.classifier4J.demo;
52
53 import java.io.FileInputStream;
54 import java.io.IOException;
55 import java.io.InputStream;
56 import java.sql.SQLException;
57
58 import net.sf.classifier4J.ClassifierException;
59 import net.sf.classifier4J.DefaultTokenizer;
60 import net.sf.classifier4J.IClassifier;
61 import net.sf.classifier4J.ITokenizer;
62 import net.sf.classifier4J.Utilities;
63 import net.sf.classifier4J.bayesian.BayesianClassifier;
64 import net.sf.classifier4J.bayesian.JDBMWordsDataSource;
65
66 /***
67 * @author Nick Lothian
68 * @author Peter Leschev
69 */
70 public class Analyser {
71
72 public static String connectionString = Trainer.connectionString;
73 public static String username = Trainer.username;
74 public static String password = Trainer.password;
75
76 static JDBMWordsDataSource wds;
77
78 private static IClassifier setupClassifier(ITokenizer tokenizer, String connString, String user, String pw) throws SQLException, IOException {
79 /*
80 DriverMangerJDBCConnectionManager cm = new DriverMangerJDBCConnectionManager(connString, user, pw);
81 JDBCWordsDataSource wds = new JDBCWordsDataSource(cm);
82 wds.createTable();
83 */
84 wds = new JDBMWordsDataSource("./database/");
85 wds.open();
86 return new BayesianClassifier(wds, tokenizer);
87 }
88
89 /***
90 * @returns Words Per Second
91 */
92 public static double useClassifier(ITokenizer tokenizer,
93 IClassifier classifier,
94 InputStream inputStream) throws IOException, ClassifierException {
95
96 // System.out.println("Using Classifier4J with " + classifier + " and " +
97 // tokenizer);
98
99 String contents = Utilities.getString(inputStream);
100 int length = tokenizer.tokenize(contents).length;
101
102 // System.out.println("Analysing " + length + " words. This may take a while.");
103
104 long startTime = System.currentTimeMillis();
105
106 double matchProb = classifier.classify(contents);
107
108 long endTime = System.currentTimeMillis();
109
110 double time = (double)(endTime - startTime) / (double)1000;
111
112 if (Double.compare(time, 0) == 0) {
113 time = 1;
114 }
115
116 double wordsPerSecond = length / time;
117
118 // System.out.println("Done. Took " + time + " seconds, which is " +
119 // wordsPerSecond + " words per second.");
120
121 // System.out.println("Match Probability = " + matchProb);
122 // System.out.println("Is considered a match: " + classifier.isMatch(matchProb));
123
124 return wordsPerSecond;
125 }
126
127 public static void main(String[] args) throws Exception {
128 System.out.println("This program reads in a single file and classifies it as a match or not.");
129 System.out.println("It should be run after the Trainer program.");
130
131
132 String filename = "./demodata/toanalyse.txt";
133
134
135 InputStream input = new FileInputStream(filename);
136 ITokenizer tokenizer = new DefaultTokenizer();
137 IClassifier classifier = setupClassifier(tokenizer, connectionString, username, password);
138
139 useClassifier(tokenizer, classifier, input);
140
141 wds.close();
142 }
143 /*
144 static {
145 try {
146 Class.forName("org.hsqldb.jdbcDriver");
147 //Class.forName("org.gjt.mm.mysql.Driver");
148 } catch (ClassNotFoundException e) {
149 e.printStackTrace();
150 }
151 }
152 */
153 }