如何运行《Mahout in Action》一书中的示例

hyrbngr7  于 2021-06-03  发布在  Hadoop
关注(0)|答案(2)|浏览(361)

我正在尝试运行第7章中的helloworld示例。我在eclipse中创建了以下内容,然后将其打包到jar:-

  1. package com.mycode.mahout
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.util.ArrayList;
  5. import java.util.List;
  6. import org.apache.hadoop.conf.Configuration;
  7. import org.apache.hadoop.fs.FileSystem;
  8. import org.apache.hadoop.fs.Path;
  9. import org.apache.hadoop.io.IntWritable;
  10. import org.apache.hadoop.io.LongWritable;
  11. import org.apache.hadoop.io.SequenceFile;
  12. import org.apache.hadoop.io.Text;
  13. import org.apache.mahout.clustering.WeightedVectorWritable;
  14. import org.apache.mahout.clustering.kmeans.Cluster;
  15. import org.apache.mahout.clustering.kmeans.KMeansDriver;
  16. import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
  17. import org.apache.mahout.math.RandomAccessSparseVector;
  18. import org.apache.mahout.math.Vector;
  19. import org.apache.mahout.math.VectorWritable;
  20. public class SimpleKMeansClustering {
  21. public static final double[][] points = { {1, 1}, {2, 1}, {1, 2},
  22. {2, 2}, {3, 3}, {8, 8},
  23. {9, 8}, {8, 9}, {9, 9}};
  24. public static void writePointsToFile(List<Vector> points,
  25. String fileName,
  26. FileSystem fs,
  27. Configuration conf) throws IOException {
  28. Path path = new Path(fileName);
  29. SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
  30. path, LongWritable.class, VectorWritable.class);
  31. long recNum = 0;
  32. VectorWritable vec = new VectorWritable();
  33. for (Vector point : points) {
  34. vec.set(point);
  35. writer.append(new LongWritable(recNum++), vec);
  36. }
  37. writer.close();
  38. }
  39. public static List<Vector> getPoints(double[][] raw) {
  40. List<Vector> points = new ArrayList<Vector>();
  41. for (int i = 0; i < raw.length; i++) {
  42. double[] fr = raw[i];
  43. Vector vec = new RandomAccessSparseVector(fr.length);
  44. vec.assign(fr);
  45. points.add(vec);
  46. }
  47. return points;
  48. }
  49. public static void main(String args[]) throws Exception {
  50. int k = 2;
  51. List<Vector> vectors = getPoints(points);
  52. File testData = new File("testdata");
  53. if (!testData.exists()) {
  54. testData.mkdir();
  55. }
  56. testData = new File("testdata/points");
  57. if (!testData.exists()) {
  58. testData.mkdir();
  59. }
  60. Configuration conf = new Configuration();
  61. FileSystem fs = FileSystem.get(conf);
  62. writePointsToFile(vectors, "testdata/points/file1", fs, conf);
  63. Path path = new Path("testdata/clusters/part-00000");
  64. SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
  65. path, Text.class, Cluster.class);
  66. for (int i = 0; i < k; i++) {
  67. Vector vec = vectors.get(i);
  68. Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure());
  69. writer.append(new Text(cluster.getIdentifier()), cluster);
  70. }
  71. writer.close();
  72. KMeansDriver.run(conf, new Path("testdata/points"), new Path("testdata/clusters"),
  73. new Path("output"), new EuclideanDistanceMeasure(), 0.001, 10,
  74. true, false);
  75. SequenceFile.Reader reader = new SequenceFile.Reader(fs,
  76. new Path("output/" + Cluster.CLUSTERED_POINTS_DIR
  77. + "/part-m-00000"), conf);
  78. IntWritable key = new IntWritable();
  79. WeightedVectorWritable value = new WeightedVectorWritable();
  80. while (reader.next(key, value)) {
  81. System.out.println(value.toString() + " belongs to cluster "
  82. + key.toString());
  83. }
  84. reader.close();
  85. }
  86. }

我把它打包成myjob.jar
现在我该如何在我的集群上执行这个呢?
我尝试了以下几种方式:

  1. hadoop jar myjob.jar com.mycode.mahout.SimpleKMeansClustering
  2. java -jar myjob.jar
  3. java -cp myjob.jar

我得到了以下错误:

  1. [root@node1 tmp]# hadoop jar mahoutfirst.jar com.mahout.emc.SimpleKMeansClustering
  2. Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/mahout/math/Vector
  3. at java.lang.Class.forName0(Native Method)
  4. at java.lang.Class.forName(Class.java:270)
  5. at org.apache.hadoop.util.RunJar.main(RunJar.java:201)
  6. Caused by: java.lang.ClassNotFoundException: org.apache.mahout.math.Vector
  7. at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
  8. at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
  9. at java.security.AccessController.doPrivileged(Native Method)
  10. at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
  11. at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
  12. at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
  13. ... 3 more

请告知使用mahout编写的代码的正确运行方式。

iaqfqrcu

iaqfqrcu1#

虽然已经很晚了,但我也遇到了类似的问题,下面的方法确实对我有用,因为我不想使用maven:
1) 转到mahout安装目录并查找*job.jar,如下所示:

  1. ls /usr/lib/mahout/
  2. conf lib mahout-core-0.5-cdh3u3-job.jar mahout-examples-0.5-cdh3u3-job.jar mahout-taste-webapp-0.5-cdh3u3.war

2) 将mahout-examples-0.5-cdh3u3-job.jar复制到代码所在的目录
3) 使用mahout提供的“job”jar文件。它打包了所有依赖项。你也需要添加你的类。当您使用hadoop和mahout库编译类时,您已经准备好了.class文件。
4) 将类文件添加到目录中的job jar mahout-core-0.5-cdh3u3-job.jar:

  1. jar uf mahout-core-0.5-cdh3u3-job.jar SimpleKMeansClustering.class

5) 使用hadoop jar运行您的代码:

  1. hadoop jar mahout-core-0.5-cdh3u3-job.jar SimpleKMeansClustering

6) 在MapReduce作业结束时,您可以看到:

  1. 1.0: [1.000, 1.000] belongs to cluster 0
  2. 1.0: [2.000, 1.000] belongs to cluster 0
  3. 1.0: [1.000, 2.000] belongs to cluster 0
  4. 1.0: [2.000, 2.000] belongs to cluster 0
  5. 1.0: [3.000, 3.000] belongs to cluster 0
  6. 1.0: [8.000, 8.000] belongs to cluster 1
  7. 1.0: [9.000, 8.000] belongs to cluster 1
  8. 1.0: [8.000, 9.000] belongs to cluster 1
  9. 1.0: [9.000, 9.000] belongs to cluster 1
展开查看全部
col17t5w

col17t5w2#

从上面的NoClassDefFoundError异常来看,您可能需要在hadoop作业中包含mahout相关的jar(我猜是mahout-core.jar)。
要把jar分发给集群中的各个mapper,可能需要使用DistributedCache或hadoop的 -libjars 选项。后一种方法在这里有说明。

相关问题