org.apache.hadoop.mapreduce.RecordReader类的使用及代码示例

x33g5p2x  于2022-01-28 转载在 其他  
字(14.7k)|赞(0)|评价(0)|浏览(146)

本文整理了Java中org.apache.hadoop.mapreduce.RecordReader类的一些代码示例,展示了RecordReader类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。RecordReader类的具体详情如下:
包路径:org.apache.hadoop.mapreduce.RecordReader
类名称:RecordReader

RecordReader介绍

[英]The record reader breaks the data into key/value pairs for input to the Mapper.
[中]记录读取器将数据分成键/值对,以输入映射器。

代码示例

代码示例来源:origin: thinkaurelius/titan

/**
 * Advances the wrapped reader until a row deserializes to a non-null vertex.
 *
 * @return {@code true} if a vertex was found and stored in {@code vertex};
 *         {@code false} once the wrapped reader reports no more key/value pairs
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  for (;;) {
    if (!reader.nextKeyValue()) {
      return false;
    }
    // TODO titan05 integration -- the duplicate() call may be unnecessary
    final TinkerVertex candidate =
        deser.readHadoopVertex(reader.getCurrentKey(), reader.getCurrentValue());
    if (candidate == null) {
      // Nothing usable came out of this row; move on to the next one.
      continue;
    }
    vertex = new VertexWritable(candidate);
    // TODO reimplement vertexquery filtering (vertexQuery.filterRelationsOf(vertex))
    return true;
  }
}

代码示例来源:origin: apache/flink

/**
 * Creates the Hadoop record reader for the given split and initializes it.
 *
 * <p>The whole operation runs while holding {@code OPEN_MUTEX}. The {@code fetched}
 * flag is reset to {@code false} whether or not the reader could be created.
 *
 * @param split the split whose wrapped Hadoop split is handed to the input format
 * @throws IOException if creating or initializing the reader fails, including when
 *         an {@link InterruptedException} is raised (it is wrapped as the cause)
 */
@Override
public void open(HadoopInputSplit split) throws IOException {
  // enforce sequential open() calls
  synchronized (OPEN_MUTEX) {
    TaskAttemptContext attemptContext =
        new TaskAttemptContextImpl(configuration, new TaskAttemptID());
    try {
      this.recordReader =
          this.mapreduceInputFormat.createRecordReader(split.getHadoopInputSplit(), attemptContext);
      this.recordReader.initialize(split.getHadoopInputSplit(), attemptContext);
    } catch (InterruptedException interrupted) {
      throw new IOException("Could not create RecordReader.", interrupted);
    } finally {
      // No record has been pre-fetched from the (new) reader yet.
      this.fetched = false;
    }
  }
}

代码示例来源:origin: apache/hive

/**
 * Advances the current record reader and reports whether a record is available.
 * When the reader is exhausted it is closed before {@code false} is returned.
 *
 * @return {@code true} if the reader moved to another key/value pair
 * @throws RuntimeException wrapping any {@link IOException} or
 *         {@link InterruptedException} raised by the reader
 */
@Override
public boolean hasNext() {
 final boolean advanced;
 try {
  advanced = curRecReader.nextKeyValue();
  if (!advanced) {
   // Exhausted: release the reader before signalling the end.
   curRecReader.close();
  }
 } catch (IOException | InterruptedException e) {
  throw new RuntimeException(e);
 }
 return advanced;
}

代码示例来源:origin: apache/flink

/**
 * Fills the supplied tuple with the reader's current key and value.
 *
 * @param record the tuple to populate; it is also the returned instance
 * @return {@code record} with {@code f0}/{@code f1} set, or {@code null} when
 *         {@code hasNext} is false after fetching
 * @throws IOException if reading the pair fails; an {@link InterruptedException}
 *         from the reader is wrapped as the cause
 */
@Override
public Tuple2<K, V> nextRecord(Tuple2<K, V> record) throws IOException {
  if (!this.fetched) {
    fetchNext();
  }
  if (this.hasNext) {
    try {
      record.f0 = recordReader.getCurrentKey();
      record.f1 = recordReader.getCurrentValue();
    } catch (InterruptedException e) {
      throw new IOException("Could not get KeyValue pair.", e);
    }
    // The current pair has been consumed; the next call must fetch again.
    this.fetched = false;
    return record;
  }
  return null;
}

代码示例来源:origin: ZuInnoTe/hadoopcryptoledger

/**
 * Reads the {@code version1.blk} test resource through
 * {@code BitcoinRawBlockFileInputFormat} and checks that exactly one split and one
 * 482-byte block are produced.
 *
 * <p>The reader is managed by try-with-resources so it is closed even when one of
 * the assertions fails.
 */
@Test
public void readBitcoinRawBlockInputFormatBlockVersion1() throws IOException, InterruptedException {
 Configuration conf = new Configuration(defaultConf);
 ClassLoader classLoader = getClass().getClassLoader();
 String fileName="version1.blk";
 String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();
 Path file = new Path(fileNameBlock);
 Job job = Job.getInstance(conf);
 FileInputFormat.setInputPaths(job, file);
 BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
 List<InputSplit> splits = format.getSplits(job);
 TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
 assertEquals( 1, splits.size(),"Only one split generated for block version 1");
 // A null resource is simply skipped by try-with-resources, so assertNotNull below still reports it.
 try (RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context)) {
  assertNotNull( reader,"Format returned  null RecordReader");
  reader.initialize(splits.get(0),context);
  assertTrue( reader.nextKeyValue(),"Input Split for block version contains at least one block");
  BytesWritable block=reader.getCurrentValue();
  assertEquals( 482, block.getLength(),"Random block version 1  must have size of 482 bytes");
  assertFalse( reader.nextKeyValue(),"No further blocks in block version 1");
 }
}

代码示例来源:origin: ZuInnoTe/hadoopoffice

/**
 * Negative test: opening the encrypted workbook {@code excel2013encrypt.xlsx} with
 * the password {@code "test2"} is expected to make {@code initialize} throw an
 * {@link InterruptedException}.
 */
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegative() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = getClass().getClassLoader().getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "de");
  // for decryption simply set the password
  conf.set("hadoopoffice.read.security.crypt.password", "test2");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  InputSplit onlySplit = splits.get(0);
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(onlySplit, context);
  assertThrows(
      InterruptedException.class,
      () -> reader.initialize(onlySplit, context),
      "Exception is thrown in case of wrong password");
}

代码示例来源:origin: apache/hive

// Excerpt of a test helper: it prepares an RCFile test path, clones the configuration
// (column count, record interval, input dir, max split size) and then iterates a
// RecordReader over a split while incrementing a read counter.
// NOTE(review): this listing is truncated -- the declarations of `splits`, `tac`, `i`,
// `inputFormat` and `readCount`, part of the statement ending in `new TaskAttemptID());`,
// and all closing braces are not shown here.
private void writeThenReadByRecordReader(int intervalRecordCount,
           int writeCount, int splitNumber, long maxSplitSize, CompressionCodec codec)
 throws IOException, InterruptedException {
 Path testDir = new Path(System.getProperty("test.tmp.dir", ".")
  + "/mapred/testsmallfirstsplit");
 Path testFile = new Path(testDir, "test_rcfile");
 // Remove any file left at the test path.
 fs.delete(testFile, true);
 Configuration cloneConf = new Configuration(conf);
 RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
 cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
 Configuration jonconf = new Configuration(cloneConf);
 jonconf.set("mapred.input.dir", testDir.toString());
 JobContext context = new Job(jonconf);
 HiveConf.setLongVar(context.getConfiguration(),
   HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    new TaskAttemptID());
  RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
  rr.initialize(splits.get(i), tac);
  while (rr.nextKeyValue()) {
   readCount++;

代码示例来源:origin: apache/avro

// Excerpt of a record-reader test: initialize the reader against a mocked context,
// read two records (checking that repeated getCurrentKey()/getCurrentValue() calls
// return the same instances), confirm there is no third record, then close the reader.
// NOTE(review): lines were elided from this listing -- several assertion calls are
// missing their opening line, leaving only dangling argument lines such as
// `0.0f, recordReader.getProgress(), 0.0f);`.
Configuration conf = new Configuration();
expect(context.getConfiguration()).andReturn(conf).anyTimes();
recordReader.initialize(inputSplit, context);
  0.0f, recordReader.getProgress(), 0.0f);
assertTrue("Expected at least one record", recordReader.nextKeyValue());
key = recordReader.getCurrentKey();
value = recordReader.getCurrentValue();
  key == recordReader.getCurrentKey());
assertTrue("getCurrentValue() returned different values for the same record",
  value == recordReader.getCurrentValue());
assertTrue("Expected to read a second record", recordReader.nextKeyValue());
key = recordReader.getCurrentKey();
value = recordReader.getCurrentValue();
  1.0f, recordReader.getProgress(), 0.0f);
assertFalse("Expected only 2 records", recordReader.nextKeyValue());
recordReader.close();

代码示例来源:origin: apache/tinkerpop

/**
 * Instantiates the configured graph-reader input format, optionally loads the graph
 * filter, and creates and initializes the delegate record reader.
 *
 * <p>The graph filter is deserialized only when the input format is not
 * {@code GraphFilterAware} and a filter is present in the configuration.
 */
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  final Configuration configuration = taskAttemptContext.getConfiguration();
  final InputFormat<NullWritable, VertexWritable> inputFormat = ReflectionUtils.newInstance(
      configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class),
      configuration);
  if (!(inputFormat instanceof GraphFilterAware)) {
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null) {
      this.graphFilter = VertexProgramHelper.deserialize(
          ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
  }
  this.recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
  this.recordReader.initialize(inputSplit, taskAttemptContext);
}

代码示例来源:origin: apache/jena

/**
 * Picks a concrete record reader based on the RDF language implied by the split's
 * file name and initializes it with the split.
 *
 * @throws IOException if the split is not a {@code FileSplit}, or no RDF language is
 *         registered for the file name
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException,
    InterruptedException {
  LOG.debug("initialize({}, {})", genericSplit, context);

  // Only file-based splits are supported.
  if (!(genericSplit instanceof FileSplit)) {
    throw new IOException("This record reader only supports FileSplit inputs");
  }
  FileSplit fileSplit = (FileSplit) genericSplit;

  // Determine the RDF language from the file name.
  Path inputPath = fileSplit.getPath();
  Lang detected = RDFLanguages.filenameToLang(inputPath.getName());
  if (detected == null) {
    throw new IOException("There is no registered RDF language for the input file " + inputPath.toString());
  }

  // Delegate to the reader chosen for that language.
  this.reader = this.selectRecordReader(detected);
  this.reader.initialize(fileSplit, context);
}

代码示例来源:origin: org.janusgraph/janusgraph-hadoop

/**
 * Initializes the wrapped reader, then loads the graph filter if one is present in
 * the task configuration.
 */
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  reader.initialize(inputSplit, taskAttemptContext);
  final Configuration conf = taskAttemptContext.getConfiguration();
  if (conf.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) == null) {
    // No filter configured; graphFilter is left untouched.
    return;
  }
  graphFilter = VertexProgramHelper.deserialize(
      ConfUtil.makeApacheConfiguration(conf), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
}

代码示例来源:origin: apache/incubator-gobblin

/**
 * Checks that the record reader created for a two-path split yields the paths in
 * order, keyed 0 and 1, and then reports exhaustion.
 *
 * <p>Each {@code nextKeyValue()} result is asserted, so a reader that fails to
 * advance is reported at the step where it happens instead of through a later
 * key/value mismatch.
 */
@Test
public void testRecordReader()
  throws Exception {
 List<String> paths = Lists.newArrayList("/path1", "/path2");
 GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);
 GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
 RecordReader<LongWritable, Text> recordReader =
   inputFormat.createRecordReader(split, new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1,
   TaskType.MAP, 1, 1)));
 Assert.assertTrue(recordReader.nextKeyValue());
 Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
 Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");
 Assert.assertTrue(recordReader.nextKeyValue());
 Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
 Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");
 Assert.assertFalse(recordReader.nextKeyValue());
}

代码示例来源:origin: apache/hbase

// Excerpt of a snapshot input-format test: create a RecordReader for the split using
// a mocked task attempt context, verify every row it returns, record the row key in
// the row tracker, and close the reader.
// NOTE(review): this listing is truncated -- the first statement starts mid-expression
// and the while loop's closing brace is not shown.
job.getConfiguration().getBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_KEY,
                  SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_DEFAULT);
when(taskAttemptContext.getConfiguration()).thenReturn(job.getConfiguration());
RecordReader<ImmutableBytesWritable, Result> rr =
  tsif.createRecordReader(split, taskAttemptContext);
rr.initialize(split, taskAttemptContext);
while (rr.nextKeyValue()) {
 byte[] row = rr.getCurrentKey().get();
 verifyRowFromMap(rr.getCurrentKey(), rr.getCurrentValue());
 rowTracker.addRow(row);
rr.close();

代码示例来源:origin: apache/incubator-gobblin

/**
 * Builds an extractor that reads the {@code FileSplit} serialized in the work unit
 * state through a Hadoop {@code RecordReader}.
 *
 * @param workUnitState must contain the serialized split under
 *        {@code FILE_SPLIT_BYTES_STRING_KEY}
 * @return the extractor wrapping the initialized record reader
 * @throws IOException if no serialized split is present, or if reader creation or
 *         initialization fails (an {@link InterruptedException} is wrapped)
 */
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
 if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
  throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
 }

 Configuration hadoopConf = new Configuration();
 FileInputFormat<K, V> inputFormat = getFileInputFormat(workUnitState, hadoopConf);

 String serializedSplit = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
 FileSplit split = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, serializedSplit);

 TaskAttemptContext attemptContext =
   getTaskAttemptContext(hadoopConf, DummyTaskAttemptIDFactory.newTaskAttemptID());
 try {
  RecordReader<K, V> reader = inputFormat.createRecordReader(split, attemptContext);
  reader.initialize(split, attemptContext);
  boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
  return getExtractor(workUnitState, reader, split, readKeys);
 } catch (InterruptedException interrupted) {
  throw new IOException(interrupted);
 }
}

代码示例来源:origin: org.apache.hadoop/hadoop-mapred-test

/**
 * Reads every record of one split and returns copies of the values.
 *
 * <p>The reader is closed in a {@code finally} block so it is released even when
 * initialization or reading throws.
 *
 * @param format the input format used to create the record reader
 * @param split the split to read
 * @param job supplies the configuration
 * @return a new {@code Text} copy of each value, in read order
 */
private static List<Text> readSplit(KeyValueTextInputFormat format, 
  InputSplit split, Job job) throws IOException, InterruptedException {
 List<Text> result = new ArrayList<Text>();
 Configuration conf = job.getConfiguration();
 TaskAttemptContext context = MapReduceTestUtil.
  createDummyMapTaskAttemptContext(conf);
 // Reuse the dummy context above instead of building a second one from the same conf.
 RecordReader<Text, Text> reader = format.createRecordReader(split, context);
 try {
  MapContext<Text, Text, Text, Text> mcontext = 
   new MapContextImpl<Text, Text, Text, Text>(conf, 
   context.getTaskAttemptID(), reader, null, null,
   MapReduceTestUtil.createDummyReporter(), 
   split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
   // Store a copy; presumably the reader may reuse its value instance between records.
   result.add(new Text(reader.getCurrentValue()));
  }
 } finally {
  reader.close();
 }
 return result;
}

代码示例来源:origin: org.neolumin.vertexium/vertexium-accumulo

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext ctx) throws IOException, InterruptedException {
  reader.initialize(inputSplit, ctx);
  Map configurationMap = VertexiumMRUtils.toMap(ctx.getConfiguration());
  this.graph = (AccumuloGraph) new GraphFactory().createGraph(MapUtils.getAllWithPrefix(configurationMap, "graph"));
  this.authorizations = new AccumuloAuthorizations(ctx.getConfiguration().getStrings(VertexiumMRUtils.CONFIG_AUTHORIZATIONS));
}

代码示例来源:origin: apache/avro

/**
  * Verifies that a non-null record reader can be created, and the key/value types are
  * as expected.
  *
  * <p>The null check is made on the created record reader itself; checking the input
  * format, which was just constructed with {@code new}, could never fail.
  */
 @Test
 public void testCreateRecordReader() throws IOException, InterruptedException {
  // Set up the job configuration.
  Job job = new Job();
  AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.STRING));
  Configuration conf = job.getConfiguration();

  FileSplit inputSplit = createMock(FileSplit.class);
  TaskAttemptContext context = createMock(TaskAttemptContext.class);
  expect(context.getConfiguration()).andReturn(conf).anyTimes();

  replay(inputSplit);
  replay(context);

  AvroKeyInputFormat inputFormat = new AvroKeyInputFormat();
  @SuppressWarnings("unchecked")
  RecordReader<AvroKey<Object>, NullWritable> recordReader = inputFormat.createRecordReader(
    inputSplit, context);
  assertNotNull(recordReader);
  recordReader.close();

  verify(inputSplit);
  verify(context);
 }
}

代码示例来源:origin: larsgeorge/hbase-book

// Excerpt of a key sampler: for each sampled split it opens a RecordReader and, for
// records where r.nextDouble() <= freq, either appends a copy of the key (while fewer
// than numSamples are held) or overwrites an existing sample at index `ind`.
// NOTE(review): this listing is truncated -- the loop over splits, the declarations of
// `samplingContext`, `i`, `r`, `freq` and `ind`, the closing braces and the return
// statement are not shown.
public Object[] getSample(InputFormat inf, Job job) throws IOException, InterruptedException {
 long counter = 0;
 List<InputSplit> splits = inf.getSplits(job);
 ArrayList<K> samples = new ArrayList<K>(numSamples);
 // Never sample more splits than actually exist.
 int splitsToSample = Math.min(maxSplitsSampled, splits.size());
   job.getConfiguration(), new TaskAttemptID());
  RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), samplingContext);
  reader.initialize(splits.get(i), samplingContext);
  while (reader.nextKeyValue()) {
   if (r.nextDouble() <= freq) {
    if (samples.size() < numSamples) {
      LOG.info(String.format("Fill: Collected %d samples from %d splits", counter, i));
     counter++;
     samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
    } else {
      samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
       reader.getCurrentKey(), null));
      if (counter % 1000 == 0)
       LOG.info(String.format("Replace Random: Collected %d samples from %d splits", counter, i));
  reader.close();

代码示例来源:origin: apache/hive

// Excerpt: build a task attempt context from the job configuration via the Hadoop
// shims, create the record reader for the table split, and initialize it.
// NOTE(review): this listing is truncated -- the body of the InterruptedException
// handler and the surrounding method are not shown.
Job job = new Job(jobConf);
TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);
 recordReader = createRecordReader(tableSplit, tac);
 try {
  recordReader.initialize(tableSplit, tac);
 } catch (InterruptedException e) {

代码示例来源:origin: apache/hive

/**
 * Advances the wrapped reader and, if a record is available, copies it into the
 * caller-supplied holders: the row key is taken from the current value's row and
 * the value holder receives the current value.
 *
 * @return {@code true} if a record was read into {@code rowKey}/{@code value}
 * @throws IOException on read failure; an {@link InterruptedException} from the
 *         reader is wrapped as the cause
 */
@Override public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
  try {
   if (!recordReader.nextKeyValue()) {
    return false;
   }
   rowKey.set(recordReader.getCurrentValue().getRow());
   value.setResult(recordReader.getCurrentValue());
   return true;
  } catch (InterruptedException e) {
   throw new IOException(e);
  }
 }
};

相关文章