Logstash 无法过滤来自 Twitter API 的 JSON

wtlkbnrh  于 2021-06-13  发布在  ElasticSearch
关注(0)|答案(1)|浏览(387)

我想删除不必要的字段,这样的字段有很多。我正在使用 Logstash 的 json 过滤器插件,但它不能正常工作:它要么不过滤数据,要么根本不把数据发送到输出。我也试过用 mutate,但没有成功。例如,我想删除顶级字段 entities,但我的各种配置都不起作用。我还想删除一些嵌套字段……
以下是我的twitter api中的json示例:

{
  "retweet_count": 0,
  "created_at": "Mon Dec 14 18:43:09 +0000 2020",
  "place": null,
  "in_reply_to_user_id_str": null,
  "lang": "pl",
  "filter_level": "low",
  "possibly_sensitive": false,
  "id": 1338555139993591800,
  "id_str": "1338555139993591814",
  "quote_count": 0,
  "is_quote_status": false,
  "geo": null,
  "entities": {
    "symbols": [],
    "user_mentions": [],
    "urls": [
      {
        "indices": [
          117,
          140
        ],
        "url": "xxx",
        "expanded_url": "xxx"
      }
    ],
    "hashtags": [
      {
        "text": "koronawirus",
        "indices": [
          84,
          96
        ]
      },
      {
        "text": "COVID19",
        "indices": [
          97,
          105
        ]
      },
      {
        "text": "Lockdown",
        "indices": [
          106,
          115
        ]
      }
    ]
  },
  "timestamp_ms": "1607971389183",
  "reply_count": 0,
  "retweeted": false,
  "text": "W Wielkiej Brytanii wykryto nowy wariant koronawirusa. Kolejne kraje z lockdownem👇\n\n#koronawirus #COVID19 #Lockdown\n\nxxx",
  "contributors": null,
  "truncated": false,
  "in_reply_to_user_id": null,
  "source": "<a href=\"xxx\">Twitter Web App</a>",
  "@timestamp": "2020-12-14T18:43:09.000Z",
  "in_reply_to_screen_name": null,
  "favorited": false,
  "in_reply_to_status_id": null,
  "user": {
    "created_at": "Tue May 12 09:11:01 +0000 2009",
    "profile_use_background_image": false,
    "lang": null,
    "contributors_enabled": false,
    "profile_text_color": "000000",
    "id": 39464882,
    "id_str": "39464882",
    "following": null,
    "geo_enabled": false,
    "profile_sidebar_fill_color": "000000",
    "is_translator": false,
    "protected": false,
    "profile_image_url": "xxx",
    "profile_link_color": "3B94D9",
    "name": "Salon24.pl",
    "profile_sidebar_border_color": "000000",
    "favourites_count": 309,
    "profile_background_image_url": "xxx",
    "followers_count": 17473,
    "description": null,
    "location": "Polska",
    "url": "xxx",
    "profile_background_color": "000000",
    "utc_offset": null,
    "profile_background_image_url_https": "xxx",
    "default_profile": false,
    "follow_request_sent": null,
    "verified": false,
    "translator_type": "none",
    "friends_count": 1028,
    "time_zone": null,
    "default_profile_image": false,
    "screen_name": "Salon24pl",
    "profile_image_url_https": "xxx",
    "statuses_count": 48490,
    "notifications": null,
    "listed_count": 203,
    "profile_background_tile": false
  },
  "in_reply_to_status_id_str": null,
  "favorite_count": 0,
  "@version": "1",
  "coordinates": null
}

这是我的实际配置:

# Pipeline as posted in the question: twitter input -> json filter -> kafka output.
input {
  twitter {
      id => "logstash_to_kafka_plugin"
      # API credentials (placeholders).
      consumer_key => "xxx"
      consumer_secret => "xxx"
      oauth_token => "xxx"
      oauth_token_secret => "xxx"
      # presumably the terms used to select tweets from the stream - confirm in the plugin docs
      keywords => [ "koronawirus" ]
      full_tweet => true
      ignore_retweets => true     
    }   
  }

filter {
    # NOTE(review): the sample event shown above has the tweet fields (entities,
    # user, ...) at the event root and no "message" field. If "message" is
    # absent, this json filter has nothing to parse and its remove_field is
    # presumably never applied - confirm against the json filter documentation.
    json {
        source => "message"
        # "[message][entities]" refers to a field nested under "message",
        # whereas the sample event has "entities" at the top level.
        remove_field => [ "[message][entities]"]
        }
}

output {
      # Events are serialized with the json codec and sent to the "twitter_tweets" topic.
      kafka {
        codec => json
        topic_id => "twitter_tweets"
      }
    }

我尝试了不同的方法来表示这个字段,比如:

remove_field => [ "entities" ] 或
remove_field => [ "[entities]" ]

但这也不管用。

ldxq2e6h

ldxq2e6h1#

尝试在 json 过滤器块之后添加一个带有 remove_field 的 mutate 过滤器,这样新的 mutate 过滤器会在 json 过滤器把字段创建到根级别之后才执行。你的过滤器看起来像这样:

filter {
    # Parse the JSON held in "message" (when present) into event fields.
    json {
        source => "message"
    }
    # Do the removal in a separate mutate filter placed after json, so it runs
    # once the fields exist at the event root.
    mutate {
        # Nested fields need bracket references: [user][created_at].
        # A dotted name such as "user.created_at" would be looked up as a
        # single top-level field and would not match.
        remove_field => ["entities", "[user][created_at]"]
    }
}

相关问题