I built a Naive Bayes classifier for text classification in Java. Now I am trying to port it to Hadoop. I built the model with a mapper and a reducer, and the output looks like this:
label1,word1 count
label1,word2 count
label1,word3 count
.
.
.
label2,word1 count
label2,word2 count
label2,word3 count
.
label3,word1 count
.
label4,word1 count
There are 4 labels in total, and I have to classify each test document into one of them. After building the model I am unable to move forward: how do I classify the test data using the model with MapReduce? Here is my current code, followed by a rough sketch of the direction I was considering for the classification step:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TrainHadoop extends Configured implements Tool {

    private static final String OUTPUT_PATH = "/user/nitin/interOutput";
    private static String[] classes = {"CCAT", "ECAT", "GCAT", "MCAT"};

    // mapper for the training data
    public static class TrainMap extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // split a document into lower-cased word tokens, dropping punctuation
        public Vector<String> tokenizeDoc(String cur_doc) {
            String[] words = cur_doc.split("\\s+");
            Vector<String> tokens = new Vector<String>();
            for (int i = 0; i < words.length; i++) {
                words[i] = words[i].replaceAll("\\W|_", "");
                if (words[i].length() > 0) {
                    tokens.add(words[i].toLowerCase());
                }
            }
            return tokens;
        }
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // input line format: "<comma-separated labels>\t<document text>"
            String[] line = value.toString().split("\t");
            String[] labelsArray = line[0].split(",");
            Vector<String> indivWord = tokenizeDoc(line[1]);

            // keep only the labels that belong to the four target classes
            List<String> finalLabelsArray = new ArrayList<>();
            for (int i = 0; i < classes.length; i++) {
                for (int j = 0; j < labelsArray.length; j++) {
                    if (classes[i].equals(labelsArray[j])) {
                        finalLabelsArray.add(classes[i]);
                    }
                }
            }

            // total number of (document, label) instances
            word.set("labelsInstances");
            context.write(word, new IntWritable(finalLabelsArray.size()));

            for (String label : finalLabelsArray) {
                // total instances of each class
                context.write(new Text(label), one);
                // total no. of words for each class
                context.write(new Text(label + "*"), new IntWritable(indivWord.size()));
                // for each class, count the occurrences of each word
                for (int i = 0; i < indivWord.size(); i++) {
                    context.write(new Text(label + "^," + indivWord.get(i)), one);
                }
                // for vocabulary size
                for (int i = 0; i < indivWord.size(); i++) {
                    context.write(new Text("A=" + indivWord.get(i)), one);
                }
            }
        }
    }
    // mappers for classifying the test data set (empty – this is where I am stuck)
    public static class TestMap1 extends Mapper<LongWritable, Text, Text, IntWritable> {
        // should read the model produced by the training job
    }

    public static class TestMap2 extends Mapper<LongWritable, Text, Text, IntWritable> {
        // should read the test documents
    }

    public static class TrainReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // sums up the counts emitted by TrainMap
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static class TestReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // not implemented yet
    }
    @Override
    public int run(String[] args) throws Exception {
        /*
         * Job 1: training
         */
        Configuration trainConf = new Configuration();
        Job trainJob = Job.getInstance(trainConf, "training");

        trainJob.setJarByClass(TrainHadoop.class);
        trainJob.setMapperClass(TrainMap.class);
        trainJob.setReducerClass(TrainReduce.class);
        //trainJob.setCombinerClass(TrainReduce.class);

        trainJob.setInputFormatClass(TextInputFormat.class);
        trainJob.setOutputFormatClass(TextOutputFormat.class);

        // output types from both mapper and reducer
        trainJob.setOutputKeyClass(Text.class);
        trainJob.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(trainJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(trainJob, new Path(OUTPUT_PATH));

        trainJob.waitForCompletion(true);

        /*
         * Job 2: classifying the test data
         */
        Configuration testConf = new Configuration();
        Job testJob = Job.getInstance(testConf, "testing");

        testJob.setJarByClass(TrainHadoop.class);
        // mapper classes are assigned per input path by MultipleInputs below
        testJob.setReducerClass(TestReduce.class);
        //testJob.setCombinerClass(TestReduce.class);

        testJob.setInputFormatClass(TextInputFormat.class);
        testJob.setOutputFormatClass(TextOutputFormat.class);

        // output types from both mapper and reducer
        testJob.setOutputKeyClass(Text.class);
        testJob.setOutputValueClass(IntWritable.class);

        MultipleInputs.addInputPath(testJob, new Path(OUTPUT_PATH + "/part-r-[0-9]*"),
                TextInputFormat.class, TestMap1.class);
        MultipleInputs.addInputPath(testJob, new Path(args[1]),
                TextInputFormat.class, TestMap2.class);
        FileOutputFormat.setOutputPath(testJob, new Path(args[2]));

        return testJob.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new TrainHadoop(), args));
    }
}
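The direction I was thinking of for the classification step (untested sketch, not working code; names like ClassifyMap are just placeholders I made up): load the model that Job 1 wrote into every test mapper through the distributed cache, then score each test document against the four labels with Laplace-smoothed log probabilities and emit the label with the highest score. The sketch assumes the model fits in memory, that the Job 1 output file has been registered on the test job with addCacheFile, and that the test lines have the same "<labels>\t<document text>" layout as the training lines. Is something like this the right way to do it with MapReduce?

// Additional imports this sketch would need:
//   java.io.BufferedReader, java.io.InputStreamReader,
//   java.util.HashMap, java.util.Map, org.apache.hadoop.fs.FileSystem

// A possible classification mapper, nested inside TrainHadoop so it can see classes[].
// It would replace TestMap1/TestMap2: Job 2 reads only the test documents, and the model
// from Job 1 is shipped with something like
//   testJob.addCacheFile(new URI(OUTPUT_PATH + "/part-r-00000"));
public static class ClassifyMap extends Mapper<LongWritable, Text, Text, Text> {

    private final Map<String, Long> model = new HashMap<String, Long>(); // counts keyed exactly as Job 1 wrote them
    private long vocabSize = 0;                                          // number of distinct "A=<word>" keys

    @Override
    protected void setup(Context context) throws IOException {
        // read the Job 1 output ("<key>\t<count>" per line) from the first cache file
        Path modelPath = new Path(context.getCacheFiles()[0]);
        FileSystem fs = modelPath.getFileSystem(context.getConfiguration());
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(modelPath)));
        String line;
        while ((line = in.readLine()) != null) {
            String[] kv = line.split("\t");
            model.put(kv[0], Long.parseLong(kv[1]));
            if (kv[0].startsWith("A=")) {
                vocabSize++;
            }
        }
        in.close();
    }

    // count for a model key, 0 if the key was never seen in training
    private long count(String key) {
        Long c = model.get(key);
        return c == null ? 0L : c;
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // test line assumed to look like the training lines: "<labels>\t<document text>"
        String[] line = value.toString().split("\t");
        String[] words = line[1].split("\\s+");

        String bestLabel = null;
        double bestScore = Double.NEGATIVE_INFINITY;
        long totalInstances = count("labelsInstances");

        for (String label : classes) {
            // log P(label): smoothed class instances over all (document, label) instances
            double score = Math.log((count(label) + 1.0) / (totalInstances + classes.length));
            long wordsInClass = count(label + "*");
            for (String w : words) {
                String token = w.replaceAll("\\W|_", "").toLowerCase();
                if (token.length() == 0) {
                    continue;
                }
                // Laplace-smoothed log P(word | label)
                long wc = count(label + "^," + token);
                score += Math.log((wc + 1.0) / (wordsInClass + vocabSize));
            }
            if (score > bestScore) {
                bestScore = score;
                bestLabel = label;
            }
        }
        // emit the document's original labels (or an id) together with the predicted label
        context.write(new Text(line[0]), new Text(bestLabel));
    }
}

With this approach Job 2 would emit Text values, so it would need setMapOutputValueClass(Text.class) / setOutputValueClass(Text.class), and it could run map-only with testJob.setNumReduceTasks(0), since no aggregation is needed. I am not sure whether loading the whole model into each mapper is acceptable for my data size, or whether I should instead join the model and the test data on the reduce side, which is what the TestMap1/TestMap2 + MultipleInputs setup above was heading towards.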