
x33g5p2x  于2022-01-17 转载在 其他  



[英]Builds a minimal FST (maps an IntsRef term to an arbitrary output) from pre-sorted terms with outputs. The FST becomes an FSA if you use NoOutputs. The FST is written on-the-fly into a compact serialized format byte array, which can be saved to / loaded from a Directory or used directly for traversal. The FST is always finite (no cycles).

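NOTE: The algorithm is described at http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698 (Mihov and Maurel, "Direct Construction of Minimal Acyclic Subsequential Transducers").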

The parameterized type T is the output type. See the subclasses of Outputs.

FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs containing more than 2.1B nodes are also now possible, however they cannot be packed.
现在可以使用大于2.1GB的FST(从Lucene 4.2开始)。现在也可以使用包含2.1B个以上节点的FST,但它们不能打包。


代码示例来源:origin: org.apache.lucene/lucene-core

final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
                           0, 0, true, false, Integer.MAX_VALUE,
                           outputs, true, 15);
assert bytes.length > 0;
scratchBytes.writeTo(bytes, 0);
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
index = indexBuilder.finish();

代码示例来源:origin: org.apache.lucene/lucene-core

/** Returns final FST.  NOTE: this will return null if
 *  nothing is accepted by the FST. */
public FST<T> finish() throws IOException {
 final UnCompiledNode<T> root = frontier[0];
 // minimize nodes in the last word's suffix
 if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) {
  if (fst.emptyOutput == null) {
   return null;
  } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
   // empty string got pruned
   return null;
 } else {
  if (minSuffixCount2 != 0) {
   compileAllTargets(root, lastInput.length());
 //if (DEBUG) System.out.println("  builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
 fst.finish(compileNode(root, lastInput.length()).node);
 return fst;

代码示例来源:origin: org.apache.lucene/lucene-core

assert lastInput.length() == 0 || input.compareTo(lastInput.get()) >= 0: "inputs are added out of order lastInput=" + lastInput.get() + " vs input=" + input;
assert validOutput(output);
 assert validOutput(lastOutput);
  assert validOutput(commonOutputPrefix);
  wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix);
  assert validOutput(wordSuffix);
  parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix);
 assert validOutput(output);

代码示例来源:origin: org.apache.lucene/lucene-core

private void freezeTail(int prefixLenPlus1) throws IOException {
 for(int idx=lastInput.length(); idx >= downTo; idx--) {
   parent.deleteLast(lastInput.intAt(idx-1), node);
  } else {
    compileAllTargets(node, lastInput.length()-idx);
              compileNode(node, 1+lastInput.length()-idx),

代码示例来源:origin: org.apache.lucene/lucene-analyzers-kuromoji

Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;
 for (int i = 0; i < token.length(); i++) {
  scratch.setIntAt(i, (int) token.charAt(i));
 fstBuilder.add(scratch.get(), ord);
this.fst = new TokenInfoFST(fstBuilder.finish(), false); = data.toArray(new String[data.size()]);
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);

代码示例来源:origin: org.elasticsearch/elasticsearch

public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
  int deduplicator = 0;
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  for (int i = 0; i < count; i++) {
    analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++);
    Util.toIntsRef(analyzed.get(), scratchInts);
    SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i];
    long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight;
    builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload));
  count = 0;

代码示例来源:origin: org.apache.lucene/lucene-core

private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
  BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
  while((indexEnt = != null) {
   //if (DEBUG) {
   //  System.out.println("      add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
   builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);

代码示例来源:origin: org.apache.lucene/lucene-analyzers-common

IntsRefBuilder scratchInts = new IntsRefBuilder();
 IntsRefBuilder currentOrds = new IntsRefBuilder();
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts.get(), currentOrds.get());
 Util.toUTF32(currentEntry, scratchInts);
 words.add(scratchInts.get(), currentOrds.get());
 success2 = true;
} finally {

代码示例来源:origin: org.apache.lucene/lucene-analyzers-common

Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
words = b.finish();

代码示例来源:origin: org.apache.lucene/lucene-spellchecker

final Object empty = outputs.getNoOutput();
final Builder<Object> builder = 
 new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
final IntsRef scratchIntsRef = new IntsRef(10);
for (Entry e : entries) {
  ints[i] = chars[i];
 builder.add(scratchIntsRef, empty);
return builder.finish();

代码示例来源:origin: org.elasticsearch/elasticsearch

public FST<Pair<Long, BytesRef>> build() throws IOException {
  return builder.finish();

代码示例来源:origin: org.elasticsearch/elasticsearch

public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
  this.payloadSep = payloadSep;
  this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
  this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
  this.hasPayloads = hasPayloads;
  surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];
public void startTerm(BytesRef analyzed) {

代码示例来源:origin: org.apache.lucene/lucene-codecs

public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
 this.fieldInfo = fieldInfo;
 fstOutputs = PositiveIntOutputs.getSingleton();
 fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
 indexStart = out.getFilePointer();
 ////System.out.println("VGW: field=" +;
 // Always put empty string in
 fstBuilder.add(new IntsRef(), termsFilePointer);
 startTermsFilePointer = termsFilePointer;

代码示例来源:origin: org.apache.lucene/lucene-core

private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
 for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
  final Arc<T> arc = node.arcs[arcIdx];
  if (! {
   // not yet compiled
   @SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T> n = (UnCompiledNode<T>);
   if (n.numArcs == 0) {
    //System.out.println("seg=" + segment + "        FORCE final arc=" + (char) arc.label);
    arc.isFinal = n.isFinal = true;
   } = compileNode(n, tailLength-1);

代码示例来源:origin: org.apache.lucene/lucene-analyzers-nori

Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
 for (int i = 0; i < token.length(); i++) {
  scratch.setIntAt(i, (int) token.charAt(i));
 fstBuilder.add(scratch.get(), ord);
 lastToken = token;
 ord ++;
this.fst = new TokenInfoFST(fstBuilder.finish());
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
this.rightIds = new short[rightIds.size()];

代码示例来源:origin: com.strapdata.elasticsearch/elasticsearch

public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
  int deduplicator = 0;
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  for (int i = 0; i < count; i++) {
    analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++);
    Util.toIntsRef(analyzed.get(), scratchInts);
    SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i];
    long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight;
    builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload));
  count = 0;

代码示例来源:origin: org.infinispan/infinispan-embedded-query

private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
  BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
  while((indexEnt = != null) {
   //if (DEBUG) {
   //  System.out.println("      add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
   builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);

代码示例来源:origin: org.infinispan/infinispan-embedded-query

Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(dictionaries, decoder, b);
words = b.finish();

代码示例来源:origin: com.strapdata.elasticsearch/elasticsearch

public FST<Pair<Long, BytesRef>> build() throws IOException {
  return builder.finish();

代码示例来源:origin: org.apache.lucene/lucene-codecs

public TermsWriter(IndexOutput out, FieldInfo field) {
 this.out = out;
 this.field = field;
 builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
