CoreNLP ner.applyFineGrained和PERSON实体注解

2ledvvac 于 6个月前发布在其他

关注(0)|答案(5)|浏览(94)

当使用 ner.applyFineGrained 设置为 true 时，NER 标注器在某些情况下会感到困惑，例如在这个短语中：

George Washington went to Washington

在这种情况下，词语 George 在输出中将没有任何实体注解，即其 ner 值为 O：

{
	"sentences": [{
				"index": 0,
				"text": "George Washington went to Washington",
				"line": 1,
				"sentimentValue": "1",
				"tokens": [{
						"index": 1,
						"word": "George",
						"characterOffsetBegin": 0,
						"characterOffsetEnd": 6,
						"before": "",
						"after": " ",
						"pos": "NNP",
						"ner": "O",
						"lemma": "George"
					},
					{
						"index": 2,
						"word": "Washington",
						"characterOffsetBegin": 7,
						"characterOffsetEnd": 17,
						"before": " ",
						"after": " ",
						"pos": "NNP",
						"ner": "STATE_OR_PROVINCE"
					},
					{
						"index": 3,
						"word": "went",
						"characterOffsetBegin": 18,
						"characterOffsetEnd": 22,
						"before": " ",
						"after": " ",
						"pos": "VBD",
						"ner": "O"
					},
					{
						"index": 4,
						"word": "to",
						"characterOffsetBegin": 23,
						"characterOffsetEnd": 25,
						"before": " ",
						"after": " ",
						"pos": "TO",
						"ner": "O"
					},
					{
						"index": 5,
						"word": "Washington",
						"characterOffsetBegin": 26,
						"characterOffsetEnd": 36,
						"before": " ",
						"after": "",
						"pos": "NNP",
						"ner": "STATE_OR_PROVINCE"
					}
				]
			}
	]
}

而当设置为 false 时，标注器能够正确地将 George 识别为命名实体(PERSON)，因此输出如下所示：

{
	"sentences": [{
		"index": 0,
		"text": "George Washington went to Washington",
		"line": 1,
		"sentimentValue": "1",
		"tokens": [{
				"index": 1,
				"word": "George",
				"characterOffsetBegin": 0,
				"characterOffsetEnd": 6,
				"before": "",
				"after": " ",
				"pos": "NNP",
				"ner": "PERSON",
				"lemma": "George",
				"phoneme": "ʤɔˈɹʤ"
			},
			{
				"index": 2,
				"word": "Washington",
				"characterOffsetBegin": 7,
				"characterOffsetEnd": 17,
				"before": " ",
				"after": " ",
				"pos": "NNP",
				"ner": "PERSON",
				"lemma": "Washington"
			},
			{
				"index": 3,
				"word": "went",
				"characterOffsetBegin": 18,
				"characterOffsetEnd": 22,
				"before": " ",
				"after": " ",
				"pos": "VBD",
				"ner": "O",
				"lemma": "go"
			},
			{
				"index": 4,
				"word": "to",
				"characterOffsetBegin": 23,
				"characterOffsetEnd": 25,
				"before": " ",
				"after": " ",
				"pos": "TO",
				"ner": "O",
				"lemma": "to"
			},
			{
				"index": 5,
				"word": "Washington",
				"characterOffsetBegin": 26,
				"characterOffsetEnd": 36,
				"before": " ",
				"after": "",
				"pos": "NNP",
				"ner": "LOCATION",
				"lemma": "Washington"
			}
		]
	}]
}

这种行为有任何原因吗？

CoreNLP

来源：https://github.com/stanfordnlp/CoreNLP/issues/828

5条答案

按热度按时间

kq4fsx7k1#

我无法复现这个错误(使用3.9.2或GitHub最新代码)。您能提供更多关于上下文的详细信息吗？
我使用的命令是：

java -Xmx10g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner -ner.applyFineGrained -file example.txt -outputFormat text

赞(0）回复(0）举报 6个月前

bis0qfac2#

@J38 非常感谢你的调试。我在代码中深入挖掘了一下，发现这种情况发生在特定的用例中：

实体由多个标记组成(因此是 George Washington)
我们将 ner.applyFineGrained 与自定义标注器配合使用；该标注器扩展了 SentenceAnnotator，并使用 NERClassifierCombiner 来识别我们定义的新实体类型 ARTIST。
对于文本 George went to Washington, Rihanna is an artist，当实体只有单个标记(例如 George)时，它按预期工作：我们既能识别基本的 PERSON 实体，也能识别我们的 ARTIST 实体：

"annotations": {
    "sentences": [
      {
        "index": 0,
        "text": "George went to Washington, Rihanna is an artist",
        "line": 1,
        "structure": "A0",
        "paragraphIndex": 0,
        "paragraphStructure": "A0",
        "tokens": [
          {
            "index": 1,
            "word": "George",
            "characterOffsetBegin": 0,
            "characterOffsetEnd": 6,
            "before": "",
            "after": " ",
            "pos": "NNP",
            "ner": "PERSON",
            "lemma": "George",
            "snippet": "George went to Washington, Rihanna is an artist",
            "entityDelimiter": "U"
          },
          ...
          {
            "index": 4,
            "word": "Washington",
            "characterOffsetBegin": 15,
            "characterOffsetEnd": 25,
            "before": " ",
            "after": "",
            "pos": "NNP",
            "ner": "STATE_OR_PROVINCE",
            "lemma": "Washington",
            "snippet": "George went to Washington, Rihanna is an artist",
            "entityDelimiter": "U"
          },
          ...
          {
            "index": 6,
            "word": "Rihanna",
            "characterOffsetBegin": 27,
            "characterOffsetEnd": 34,
            "before": " ",
            "after": " ",
            "pos": "NNP",
            "ner": "ARTIST",
            "lemma": "Rihanna",
            "mxmID": "33491890",
            "snippet": "George went to Washington, Rihanna is an artist",
            "entityDelimiter": "U"
          },
...
    ],

在这种情况下，我们运行时使用的 ner.fine.regexner.mapping 配置如下：

"ner.applyFineGrained": true,
        "ner.fine.regexner.mapping": "header=true,mxm_nlpdata/mxm_casedentities.tab;ignorecase=true,edu/stanford/nlp/models/kbp/regexner_caseless.tab;edu/stanford/nlp/models/kbp/regexner_cased.tab;ignorecase=true,header=true, mxm_nlpdata/mxm_entities.tab;ignorecase=true,header=true, mxm_nlpdata/mxm_artists.tab;mxm_nlpdata/mxm_labels.tab;ignorecase=true, mxm_nlpdata/mxm_blacklist.tab"

所以似乎当我们的自定义 SentenceAnnotator 覆盖 annotate 方法时会失败：

/**
 * Runs NER on a masked view of {@code annotation} and merges the result back.
 *
 * Steps, in the order they execute:
 * 1. Optionally set the document date (present date and/or provided date).
 * 2. Decompose the annotation through an {@code AnnotationsMask}.
 * 3. Run the parent annotator and {@code ner.finalizeAnnotation} on the masked annotation.
 * 4. Optionally run the Spanish number annotator, the fine-grained NER annotator
 *    and the entity-mentions annotator on the masked annotation.
 * 5. Recompose the original annotation from the masked one, passing a map of
 *    per-annotation default values.
 *
 * NOTE(review): the semantics of {@code AnnotationsMask.decompose}/{@code recompose}
 * are not visible here; how tokens and their NER tags are mapped back onto the
 * original annotation should be confirmed against that class.
 *
 * @param annotation the document annotation to process
 */
@Override
	public void annotate(Annotation annotation) {
		if (VERBOSE) {
			log.info("Adding NER Combiner annotation ... ");
		}

		// if ner.usePresentDateForDocDate is set, use the present date as the doc date
		if (usePresentDateForDocDate) {
			String currentDate =
					new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime());
			annotation.set(CoreAnnotations.DocDateAnnotation.class, currentDate);
		}
		// use provided doc date if applicable; because this runs after the block above,
		// a non-empty provided date overwrites the present date when both are configured
		if (!providedDocDate.equals("")) {
			annotation.set(CoreAnnotations.DocDateAnnotation.class, providedDocDate);
		}
		
		
		
		// everything below operates on the masked annotation, not on the original one
		AnnotationsMask mask = new AnnotationsMask(true);

		Annotation maskedAnnotation = mask.decompose(annotation);
		

		// run the parent annotator first, then let the NER component finalize its annotations
		super.annotate(maskedAnnotation);
		this.ner.finalizeAnnotation(maskedAnnotation);

		// note: "done." is logged here, before the optional post-processing steps below
		if (VERBOSE) {
			log.info("done.");
		}
		// if Spanish, run the regexner with Spanish number rules
		if (LanguageInfo.HumanLanguage.SPANISH.equals(language))
			spanishNumberAnnotator.annotate(maskedAnnotation);
		// if fine grained ner is requested, run that
		if (this.applyFineGrained) {
			fineGrainedNERAnnotator.annotate(maskedAnnotation);
			// set the FineGrainedNamedEntityTagAnnotation.class
			// (copies each token's current NER tag, as left by the fine-grained annotator)
			for (CoreLabel token : maskedAnnotation.get(CoreAnnotations.TokensAnnotation.class)) {
				String fineGrainedTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
				token.set(CoreAnnotations.FineGrainedNamedEntityTagAnnotation.class, fineGrainedTag);
			}
		}
		// if entity mentions should be built, run that
		if (this.buildEntityMentions)
			entityMentionsAnnotator.annotate(maskedAnnotation);
		
		
		// annotation class -> default value handed to mask.recompose; only the NER tag
		// has a non-null default ("O"), every other entry defaults to null
		// NOTE(review): presumably these defaults are applied to tokens that the mask
		// excluded or could not match — confirm in AnnotationsMask.recompose
		Map<Class, Object> mapped_defaults = new HashMap<>();

		mapped_defaults.put(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
		mapped_defaults.put(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, null);
		mapped_defaults.put(MXMCoreAnnotations.MXMSlangCorrectionAnnotation.class, null);
		mapped_defaults.put(MXMCoreAnnotations.MXMEntityID.class, null);
		mapped_defaults.put(CoreAnnotations.LinkAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.ValueAnnotation.class, null);
		mapped_defaults.put(TimeExpression.Annotation.class, null);
		mapped_defaults.put(TimeExpression.TimeIndexAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.DistSimAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.NumericCompositeTypeAnnotation.class, null);
		mapped_defaults.put(TimeExpression.ChildrenAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.NumericTypeAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.ShapeAnnotation.class, null);
		mapped_defaults.put(Tags.TagsAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.NumerizedTokensAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.AnswerAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.NumericCompositeValueAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.CoarseNamedEntityTagAnnotation.class, null);
		mapped_defaults.put(CoreAnnotations.FineGrainedNamedEntityTagAnnotation.class, null);

		
		// NOTE(review): reassigning the parameter only changes the local reference; the
		// caller sees the recomposed result only if recompose mutates the original
		// annotation in place — TODO confirm
		annotation = mask.recompose(annotation, maskedAnnotation, mapped_defaults);

	}

赞(0）回复(0）举报 6个月前

qhhrdooz3#

你能给我展示一下管道设置吗？你创建了一个统计模型来标记"ARTIST"吗？
另外，这里有一份最近撰写的 NER 流程说明，其中对每个步骤都有非常详细的描述：
https://stanfordnlp.github.io/CoreNLP/ner.html

赞(0）回复(0）举报 6个月前

pprl5pva4#

当然，我的配置如下：

// Pipeline options passed to CoreNLP for this setup.
// The "annotators" list names the stages to run; each "customAnnotatorClass.<name>"
// entry below supplies the Java class for the custom annotator with that name.
var options = {

        "lang": "en",
        
        // Stages listed in the order given; every mxm* name has a customAnnotatorClass entry below.
        "annotators": "tokenize,mxmssplit,mxmslang,mxmphonetics,mxmsegmenter,mxmpos,mxmlemma,mxmner,mxmsentiment",
        
        // POS: class for the "mxmpos" annotator
        "customAnnotatorClass.mxmpos": "musixmatch_nlp.MXMPartOfSpeechAnnotator",

        // LEMMATIZER: class for the "mxmlemma" annotator
        "customAnnotatorClass.mxmlemma": "musixmatch_nlp.MXMMorphaAnnotator",

        // PHONEMES: class for the "mxmphonetics" annotator
        "customAnnotatorClass.mxmphonetics": "musixmatch_nlp.MXMPhoneticsAnnotator",

        // SEGMENTER: class for the "mxmsegmenter" annotator
        "customAnnotatorClass.mxmsegmenter": "musixmatch_nlp.MXMLyricsSegmenterAnnotator",

        // SLANG: class for the "mxmslang" annotator
        "customAnnotatorClass.mxmslang": "musixmatch_nlp.MXMSlangCorrector",

        // NER: class for the "mxmner" annotator (the custom NER combiner discussed in this thread)
        "customAnnotatorClass.mxmner": "musixmatch_nlp.MXMNERCombinerAnnotator",

        // SPLIT: class for the "mxmssplit" annotator
        "customAnnotatorClass.mxmssplit": "musixmatch_nlp.MXMWordToSentencesAnnotator",

        // SENTIMENT: class for the "mxmsentiment" annotator
        "customAnnotatorClass.mxmsentiment": "musixmatch_nlp.MXMSentimentTensorflowAnnotator",

        // Settings for individual annotators (presumably read by the custom classes above — verify).
        "mxmphonetics.ipa_dict": "/root/en_cmuipadict.txt",
        "mxmsentiment.model_dir": "/root/blstm_att1530026090",
        "mxmslang.language": "en",
        "ssplit.newlineIsSentenceBreak": "always",
        
        // Fine-grained NER is enabled; entity-mention building is disabled.
        "ner.applyFineGrained": true,
        "ner.buildEntityMentions": false,
        
        // Semicolon-separated list of mapping files; each entry may be prefixed with
        // comma-separated options (ignorecase=true, header=true).
        // NOTE(review): some entries have a space after the comma before the file path —
        // confirm the mapping parser trims it.
        "ner.fine.regexner.mapping": "header=true,mxm_nlpdata/mxm_casedentities.tab;ignorecase=true,edu/stanford/nlp/models/kbp/regexner_caseless.tab;edu/stanford/nlp/models/kbp/regexner_cased.tab;ignorecase=true,header=true, mxm_nlpdata/mxm_entities.tab;ignorecase=true,header=true, mxm_nlpdata/mxm_artists.tab;mxm_nlpdata/mxm_labels.tab;ignorecase=true, mxm_nlpdata/mxm_blacklist.tab"
        
        
    
    };

我们在这里使用了若干自定义的扩展类，其中与 NER 分类器相关的关键部分是 mxmner 及其配置 "musixmatch_nlp.MXMNERCombinerAnnotator" 。
您可以在上面找到实现 MXMNERCombinerAnnotator 的Java类，该类扩展了 SentenceAnnotator 。
基本上，它通常可以正常工作，并能打上新的 ARTIST 标签；但在上述实体由多个标记组成的情况下，它会失败。