import sys
import xmltodict
import json
from elasticsearch import Elasticsearch
# Name of the Elasticsearch index that every bulk action targets.
INDEX = "xmlfiles"
# Value written to the `_type` field of every bulk action.
TYPE = "record"
def xml_to_actions(xmlcontent, *, index=None, doc_type=None):
    """Yield Elasticsearch bulk-API lines for every record in *xmlcontent*.

    Two strings are produced per record, in order: the ``index`` action
    metadata line, then the JSON-serialized record itself.

    Args:
        xmlcontent: Mapping whose ``"records"`` entry is iterated; each
            item becomes one document.
        index: Target index name. Defaults to the module-level ``INDEX``.
        doc_type: Document ``_type``. Defaults to the module-level ``TYPE``.

    Yields:
        str: Alternating action-metadata and document JSON strings.

    Raises:
        KeyError: If *xmlcontent* has no ``"records"`` key (raised when the
            generator is first advanced).
    """
    # Resolve defaults here so the module constants are only read when the
    # caller did not override them.
    index = INDEX if index is None else index
    doc_type = TYPE if doc_type is None else doc_type

    # Serialize the metadata with json.dumps rather than %-formatting so that
    # names containing quotes or backslashes still produce valid JSON. The
    # line is the same for every record, so it is built once.
    action = json.dumps({"index": {"_index": index, "_type": doc_type}})

    # NOTE(review): if xmlcontent["records"] is a dict, this loop yields its
    # keys rather than records. Presumably callers supply a list of record
    # mappings here -- confirm against the parsed shape of the real files.
    for record in xmlcontent["records"]:
        yield action
        # default=int is kept from the original: objects json cannot encode
        # natively are passed through int().
        yield json.dumps(record, default=int)
# Connect with no arguments, i.e. to localhost:9200.
e = Elasticsearch()

# Fail fast with an actionable message when the target index is missing.
# The index is passed by keyword so the call works on clients that accept
# API parameters as keywords only as well as on older ones.
if not e.indices.exists(index=INDEX):
    raise RuntimeError('index does not exists, use `curl -X PUT "localhost:9200/%s"` and try again' % INDEX)

# sys.argv[0] is the script's own path; only the remaining arguments are
# XML files to load.
for f in sys.argv[1:]:
    # Binary mode: xmltodict hands file objects to the expat parser, which
    # expects read() to return bytes and handles the XML encoding itself.
    with open(f, "rb") as fin:
        r = e.bulk(body=xml_to_actions(xmltodict.parse(fin)))  # returns a dict
    # Per file, print True when the bulk response reports no errors.
    print(f, not r["errors"])
1条答案
按热度按时间krugob8w1#
对于 Python 3，我建议使用 xmltodict。
先运行以下命令安装依赖：
pip install xmltodict elasticsearch
我假设每个 XML 文件都包含多条记录，因此需要先把文件内容拆分成一条条记录。
新建一个名为“load.py”的脚本，并写入以下内容：
然后按如下方式运行：
python load.py xml1.xml xml2.xml ... xml20.xml