[英]A machine learning "example" to be used in training, testing or performance of various machine learning algorithms.
An instance contains four generic fields of predefined name: "data", "target", "name", and "source". "Data" holds the data represented by the instance, "target" is often a label associated with the instance, "name" is a short identifying name for the instance (such as a filename), and "source" is human-readable source information (such as the original text).
Each field has no predefined type, and may change type as the instance is processed. For example, the data field may start off being a string that represents a file name and then be processed by a cc.mallet.pipe.Pipe into a CharSequence representing the contents of the file, and eventually to a feature vector holding indices into a cc.mallet.types.Alphabet holding words found in the file. It is up to each pipe which fields in the Instance it modifies; the most common case is that the pipe modifies the data field.
Generally speaking, there are two modes of operation for Instances. (1) An instance gets created and passed through a Pipe, and the resulting data/target/name/source fields are used. This is generally done for training instances. (2) An instance gets created with raw values in its slots, then different users of the instance call newPipedCopy() with their respective different pipes. This might be done for test instances at "performance" time.
Rather than store a cc.mallet.types.Alphabet in the Instance, we obtain it through the Pipe instance variable, because the Pipe also indicates where the data came from and how to interpret the Alphabet.
Instances can be made immutable if locked. Although unlocked Instances are mutable, typically the only code that changes the values in the four slots is inside Pipes.
Note that constructing an instance with a pipe argument means "Construct the instance and then run it through the pipe". cc.mallet.types.InstanceList uses this method when adding instances through a pipeInputIterator.
一般来说,实例有两种操作模式。(1) 创建一个实例并通过管道传递,然后使用生成的数据/目标/名称/源字段。这通常是针对训练实例进行的。(2) 实例在其插槽中使用原始值创建,然后实例的不同用户使用各自不同的管道调用newPipedCopy()。这可以在“性能”时对测试实例执行。
代码示例来源:origin: de.julielab/jcore-mallet-2.0.9
public Instance pipe (Instance carrier) {
if (carrier.getData() instanceof CharSequence) {
CharSequence data = (CharSequence) carrier.getData();
else {
throw new IllegalArgumentException("CharSequenceLowercase expects a CharSequence, found a " + carrier.getData().getClass());
return carrier;
代码示例来源:origin: de.julielab/jcore-mallet-2.0.9
// Takes the next instance from subIt, runs it through pipe, and returns a new
// Instance built from the piped result.
public Instance next ()
Instance inst = subIt.next ();
inst = pipe.pipe (inst);
// Copy the four fields into a fresh Instance rather than returning the piped object itself.
return new Instance (inst.getData (), inst.getTarget (), inst.getName (), inst.getSource ());
代码示例来源:origin: com.github.steveash.mallet/mallet
// Replaces a String target with the Double parsed from it and returns the same instance.
// Throws IllegalArgumentException when a non-null target is not a String.
// NOTE(review): closing braces are missing from this excerpt; the conversion presumably
// sits inside the null check, so a null target would be left as-is -- confirm.
public Instance pipe (Instance carrier) {
if (carrier.getTarget() != null) {
if (! (carrier.getTarget() instanceof String)) {
throw new IllegalArgumentException ("Target must be a string for conversion to Double");
// The cast is safe here because non-String targets were rejected above.
carrier.setTarget( new Double((String) carrier.getTarget()) );
return carrier;
代码示例来源:origin: cc.mallet/mallet
// Stores the instance's current data in its source field and returns the same instance.
public Instance pipe (Instance carrier)
carrier.setSource (carrier.getData());
return carrier;
代码示例来源:origin: cc.mallet/mallet
// Prints the instance's name, target and data to standard output, preceded by prefix
// when one is set, and returns the instance without modifying it.
public Instance pipe (Instance carrier)
if (prefix != null)
System.out.print (prefix);
// Placeholder shown when the instance has no target.
String targetString = "<null>";
if (carrier.getTarget() != null)
targetString = carrier.getTarget().toString();
System.out.println ("name: " + carrier.getName() +
"\ntarget: " + targetString +
"\ninput: " + carrier.getData() // Swapping order, since data often has a newline at the end -DM
return carrier;
代码示例来源:origin: com.github.steveash.mallet/mallet
public Instance next () {
if (currentIndex >= currentTokenSequence.size()) {
currentInstance = source.next();
currentTokenSequence = (TokenSequence) currentInstance.getData();
Instance ret = new Instance (currentTokenSequence.get(currentIndex),
null, null);
return ret;
public boolean hasNext () {
代码示例来源:origin: com.github.steveash.jg2p/jg2p-core
public Instance pipe(Instance inst) {
List<String> source = (List<String>) inst.getData();
if (inst.getTarget() != null && updateTarget) {
List<String> target = (List<String>) inst.getTarget();
Preconditions.checkState(target.size() == source.size(), "target %s source %s", target, source);
return inst;
代码示例来源:origin: cc.mallet/mallet
// Creates a new Instance that references the same data, target, name and source objects
// as this one, and carries over the locked flag and the properties reference.
public Instance shallowCopy ()
Instance ret = new Instance (data, target, name, source);
ret.locked = locked;
// The properties object is shared with the copy, not duplicated.
ret.properties = properties;
return ret;
代码示例来源:origin: de.julielab/jcore-mallet-2.0.9
// Wraps obj as the data of a temporary Instance (target, name and source are null),
// runs it through tokenizationPipe, and returns the resulting data cast to Tokenization.
private Tokenization doTokenize (Object obj)
Instance toked = new Instance (obj, null, null, null);
// The pipe's return value is ignored; the result is read back from toked itself.
tokenizationPipe.pipe (toked);
return (Tokenization) toked.getData ();
代码示例来源:origin: cc.mallet/mallet
public void unhideAllLabels ()
for (int i = 0; i < this.size(); i++) {
Instance instance = this.get(i);
Object t;
if (instance.getTarget() == null && (t=instance.getProperty(TARGET_PROPERTY)) != null) {
代码示例来源:origin: cc.mallet/mallet
private static void printTrialClassification(Trial trial)
for (Classification c : trial) {
Instance instance = c.getInstance();
System.out.print(instance.getName() + " " + instance.getTarget() + " ");
Labeling labeling = c.getLabeling();
for (int j = 0; j < labeling.numLocations(); j++){
System.out.print(labeling.getLabelAtRank(j).toString() + ":" + labeling.getValueAtRank(j) + " ");
代码示例来源:origin: cc.mallet/mallet
// Convenience overload: casts the instance's data to FeatureVectorSequence and its target
// to LabelsAssignment, then delegates to the three-argument addInstantiatedCliques.
public void addInstantiatedCliques (UnrolledGraph graph, Instance instance)
FeatureVectorSequence fvs = (FeatureVectorSequence) instance.getData ();
LabelsAssignment lblseq = (LabelsAssignment) instance.getTarget ();
addInstantiatedCliques (graph, fvs, lblseq);
代码示例来源:origin: cc.mallet/mallet
public void testOne () {
Iterator iter = new PatternMatchIterator( data, Pattern.compile("<p>(.+?)</p>", Pattern.DOTALL));
int i=0;
while (iter.hasNext()) {
Instance inst = (Instance) iter.next();
System.out.println( inst.getName() + " : " + inst.getData() );
if (i++==0)
assertTrue (inst.getData().equals("Inside inside inside"));
assertTrue (inst.getData().equals("inside\ninside"));
代码示例来源:origin: cc.mallet/mallet
// Returns an array with one entry per element of data, each holding the name of that
// element's instance cast to String.
public String[] getDocumentNames() {
String[] docNames = new String[ data.size() ];
for (int doc = 0; doc < docNames.length; doc++) {
// NOTE(review): the cast assumes every instance name is a String -- confirm against callers.
docNames[doc] = (String) data.get(doc).instance.getName();
return docNames;
代码示例来源:origin: de.julielab/jcore-mallet-2.0.9
/**
 * Sets the "target" field to <code>null</code> in all instances. This makes unlabeled data.
 * Iterates over every Instance in this collection and calls setTarget(null) on each one.
 */
public void removeTargets()
for (Instance instance : this)
instance.setTarget (null);
代码示例来源:origin: cc.mallet/mallet
// Stores the instance and its confidence, then compares the instance's target sequence
// against the predicted sequence position by position; correct is set to false when
// any position differs.
// NOTE(review): predicted is indexed up to truth.size(), so it is presumably expected to be
// at least as long as the target sequence -- confirm.
public InstanceWithConfidence (Instance inst, double c, Sequence predicted) {
this.instance = inst;
this.confidence = c;
// Start out assuming the prediction is correct.
this.correct = true;
Sequence truth = (Sequence) inst.getTarget ();
for (int i=0; i < truth.size(); i++) {
if (!truth.get(i).equals (predicted.get(i))) {
this.correct = false;
代码示例来源:origin: de.julielab/jcore-mallet-2.0.9
// Writes "name", "target" and "input" lines for the instance to System.out, after the
// prefix if one is configured; the instance itself is returned untouched.
public Instance pipe (Instance carrier)
if (prefix != null)
System.out.print (prefix);
// Default text used when getTarget() returns null.
String targetString = "<null>";
if (carrier.getTarget() != null)
targetString = carrier.getTarget().toString();
System.out.println ("name: " + carrier.getName() +
"\ntarget: " + targetString +
"\ninput: " + carrier.getData() // Swapping order, since data often has a newline at the end -DM
return carrier;
代码示例来源:origin: cc.mallet/mallet
public Instance next () {
if (currentIndex >= currentTokenSequence.size()) {
currentInstance = source.next();
currentTokenSequence = (TokenSequence) currentInstance.getData();
Instance ret = new Instance (currentTokenSequence.get(currentIndex),
null, null);
return ret;
public boolean hasNext () {
代码示例来源:origin: de.julielab/jcore-mallet-2.0.9
// Returns a new Instance constructed from this instance's data, target, name and source
// references, with the same locked flag and the same properties reference.
public Instance shallowCopy ()
Instance ret = new Instance (data, target, name, source);
ret.locked = locked;
// Assigned by reference, so both instances point at the same properties object.
ret.properties = properties;
return ret;
代码示例来源:origin: cc.mallet/mallet
// Tokenizes obj by placing it in the data field of a throwaway Instance, passing that
// Instance through tokenizationPipe, and returning its data cast to Tokenization.
private Tokenization doTokenize (Object obj)
Instance toked = new Instance (obj, null, null, null);
// Result is read from toked afterwards; the value returned by pipe() is not used.
tokenizationPipe.pipe (toked);
return (Tokenization) toked.getData ();