Input format for the OpenNLP MaxEnt implementation?
I'm trying to use the OpenNLP implementation of the maximum entropy classifier, but the documentation seems to be quite lacking, and despite the library apparently being designed for ease of use, I cannot find a single example and/or specification of the input file format (i.e., the training set).
Does anybody know where to find this, or a minimal working example of training?
OpenNLP's format is quite flexible. If you want to use the MaxEnt classifier in OpenNLP, there are a few steps involved.
Here is some sample code with comments:
package example;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import opennlp.tools.ml.maxent.GISTrainer;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Minimal example of training an OpenNLP maximum-entropy (GIS) model from a
 * plain-text file and then evaluating the model on the same data.
 *
 * <p>Input format: one training event per line, written as
 * {@code <list of features separated by whitespace> <outcome>}; the last token
 * on the line is the outcome, everything before it is the context.
 */
public class ReadData {

    /**
     * Adapts a stream of text lines into a stream of training events.
     * Each line is split on whitespace; the last token becomes the outcome and
     * all preceding tokens become the features (context).
     */
    private static final class LineEventStream extends FilterObjectStream<String, Event> {

        LineEventStream(ObjectStream<String> lines) {
            super(lines);
        }

        /**
         * Reads the next non-blank line and converts it into an event.
         *
         * @return the next event, or {@code null} when the underlying stream is exhausted
         * @throws IOException if reading from the underlying stream fails
         */
        @Override
        public Event read() throws IOException {
            String line;
            while ((line = samples.read()) != null) {
                String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(line);
                if (parts.length == 0) {
                    // Blank line: there is no outcome token, and copying
                    // parts.length - 1 elements would throw, so skip it.
                    continue;
                }
                String outcome = parts[parts.length - 1];
                String[] context = Arrays.copyOf(parts, parts.length - 1);
                System.out.println(outcome + " " + Arrays.toString(context));
                return new Event(outcome, context);
            }
            return null;
        }
    }

    /**
     * Trains a GIS model on {@code football.dat} and prints, for every event in
     * that same file, the predicted outcome and whether it matches the label.
     *
     * @param args unused
     * @throws Exception if the data file cannot be read or training fails
     */
    public static void main(String[] args) throws Exception {
        // This is the data file; change it to fit your needs.
        File f = new File("football.dat");

        // The trainer needs an ObjectStream of events.
        // First create an InputStreamFactory: given a file it can create a
        // fresh InputStream, which is required for resetting the stream.
        MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(f);

        // Read the file line by line. You can write your own stream instead if
        // you need to handle binary files or records that span several lines.
        ObjectStream<String> stream =
                new PlainTextByLineStream(factory, Charset.defaultCharset());

        // Convert the stream of strings into a stream of events, and make sure
        // the underlying file is closed when we are done.
        try (ObjectStream<Event> eventStream = new LineEventStream(stream)) {
            TrainingParameters parameters = new TrainingParameters();
            // By default OpenNLP uses a cutoff of 5 (a feature has to occur
            // 5 times before it is used); use 1 for this small data set.
            parameters.put(GISTrainer.CUTOFF_PARAM, 1);

            GISTrainer trainer = new GISTrainer();

            // The report map is supposed to record when default values are assigned.
            Map<String, String> reportMap = new HashMap<>();

            // Don't forget to initialize the trainer!
            trainer.init(parameters, reportMap);
            MaxentModel model = trainer.train(eventStream);

            // Now we have a model. It should be tested on a separate test set,
            // but for this toy example we just rewind and reuse the training data.
            eventStream.reset();

            Event evt;
            while ((evt = eventStream.read()) != null) {
                System.out.print(Arrays.toString(evt.getContext()) + ": ");

                // Evaluate the event's context using our model; in a real
                // evaluation you would want to compute summary statistics.
                double[] p = model.eval(evt.getContext());
                String best = model.getBestOutcome(p);
                System.out.print(best + " ");

                if (best.equals(evt.getOutcome())) {
                    System.out.println("correct");
                } else {
                    System.out.println("incorrect");
                }
            }
        }
    }
}
football.dat:
home=man_united beckham=false scholes=true neville=true henry=true kanu=true parlour=false ferguson=confident wengler=tense arsenal_lost_previous man_united_won_previous arsenal
home=man_united beckham=true scholes=false neville=true henry=false kanu=true parlour=false ferguson=tense wengler=confident arsenal_won_previous man_united_lost_previous man_united
home=man_united beckham=false scholes=true neville=true henry=true kanu=true parlour=false ferguson=tense wengler=tense arsenal_lost_previous man_united_won_previous tie
home=man_united beckham=true scholes=true neville=false henry=true kanu=false parlour=false ferguson=confident wengler=confident arsenal_won_previous man_united_won_previous tie
home=man_united beckham=false scholes=true neville=true henry=true kanu=true parlour=false ferguson=confident wengler=tense arsenal_won_previous man_united_won_previous arsenal
home=man_united beckham=false scholes=true neville=true henry=false kanu=true parlour=false ferguson=confident wengler=confident arsenal_won_previous man_united_won_previous man_united
home=man_united beckham=true scholes=true neville=false henry=true kanu=true parlour=false ferguson=confident wengler=tense arsenal_won_previous man_united_won_previous man_united
home=arsenal beckham=false scholes=true neville=true henry=true kanu=true parlour=false ferguson=confident wengler=tense arsenal_lost_previous man_united_won_previous arsenal
home=arsenal beckham=true scholes=false neville=true henry=false kanu=true parlour=false ferguson=tense wengler=confident arsenal_won_previous man_united_lost_previous arsenal
home=arsenal beckham=false scholes=true neville=true henry=true kanu=true parlour=false ferguson=tense wengler=tense arsenal_lost_previous man_united_won_previous tie
home=arsenal beckham=true scholes=true neville=false henry=true kanu=false parlour=false ferguson=confident wengler=confident arsenal_won_previous man_united_won_previous man_united
home=arsenal beckham=false scholes=true neville=true henry=true kanu=true parlour=false ferguson=confident wengler=tense arsenal_won_previous man_united_won_previous arsenal
home=arsenal beckham=false scholes=true neville=true henry=false kanu=true parlour=false ferguson=confident wengler=confident arsenal_won_previous man_united_won_previous man_united
home=arsenal beckham=true scholes=true neville=false henry=true kanu=true parlour=false ferguson=confident wengler=tense arsenal_won_previous man_united_won_previous arsenal
Hope this helps.
Comments
Post a Comment