在MongoDB/PyMongo中计算外部数组中值的出现次数

6ss1mwsb  于 2022-12-12  发布在  Go
关注(0)|答案(1)|浏览(178)

我有两个系列:
会议记录:

{
      "_id": {
        "$oid": "638d07005bdfe572e95b9282"
      },
      "key": "reference/genetic/2015",
      "editor": [
        "Amir Hossein Gandomi",
        "Amir Hossein Alavi",
        "Conor Ryan"
      ],
      "title": "Handbook of Genetic Programming Applications",
      "booktitle": {
        "$numberDouble": "NaN"
      },
      "publisher": "Springer",
      "volume": {
        "$numberDouble": "NaN"
      },
      "year": "2015"
    }

在诉讼中:

{
      "_id": {
        "$oid": "638d06b85bdfe572e92b7567"
      },
      "key": "conf/coopis/ChenD00",
      "author": [
        "Qiming Chen",
        "Umeshwar Dayal"
      ],
      "title": "Multi-Agent Cooperative Transactions for E-Commerce.",
      "pages": "311-322",
      "year": "2000",
      "booktitle": "CoopIS"
    }

我需要计算一个人在前面的集合中作为编辑出现的次数,并将其与他们的名字在后面的集合中作为作者出现的次数相加。
这就是我所拥有的:

m6 = proceeding_collection.aggregate([
        {
            "$unwind": "$editor"
        },
        {
            "$match": {
                "editor": { "$ne": numpy.NaN }
            }
        },
        {
            "$group": {
                "_id": "$editor",
                "count": { "$sum": 1 }
            }
        },
        {
            "$lookup": {
                "from": "inproceedings",
                "let": {"editor": "$_id"},
                "pipeline": 
                [
                    {
                        "$unwind": "$author"
                    },
                    {
                        "$match":
                            {
                                "$expr": {
                                    "$in": 
                                        ["$$editor", ["$author"]]
                                }
                            }
                        
                    },
                ],
                "as": "inproceedings"
            }
        },
        {
            "$project": {
                "_id": 1,
                "count": 1,
                "inproceedings_count": { "$size": "$inproceedings" }
            }
        },
        {
            "$addFields": {
                "total_count": { "$sum": ["$count", "$inproceedings_count"] }
            }
        },
        {
            "$sort": {
                "total_count": -1
            }
        },
        {
            "$limit": 10
        }
    ])
    for doc in m6:
        print(doc)

第一部分工作得很好,"editor"字段可以是数组或单个字符串值,但外部集合中的"author"字段也是如此。因此,我尝试在管道中展开数组,并基于编辑器名称进行匹配,但在进程中计数始终为零。

brjng4g3

brjng4g31#

当你在执行单条件绝对连接时,你可以使用$lookup的简单版本。https://mongoplayground.net/p/pRM-sbQwSp4

db.proceeding_collection.aggregate([
  {
    $unwind: "$editor"
  },
  {
    $group: {
      _id: "$editor",
      editorCount: {
        $sum: 1
      }
    }
  },
  {
    $lookup: {
      from: "inproceedings",
      localField: "_id",
      foreignField: "author",
      as: "inproceedings"
    }
  },
  {
    $project: {
      _id: 0,
      name: "$_id",
      editorCount: 1,
      authorCount: {
        $size: "$inproceedings"
      },
      totalCount: {
        $add: [
          "$editorCount",
          {
            $size: "$inproceedings"
          }
        ]
      }
    }
  },
  {
    $sort: {
      totalCount: -1
    }
  }
])
  • 为简洁起见,省略了一些限制/匹配语句。*
    其他协助:

我使用mgodatagen生成了3 M个会议记录和50 k个会议记录,并在author上添加了一个索引。(我在editor上添加索引是错误的,如果你想做一个初始的$project,它不会加速任何事情。无论如何,mgodatagen在这里:

[
  {
    "database": "local",
    "collection": "proceeding_collection",
    "count": 3000000,
    "content": {
      "key": {
        "type": "stringFromParts",
        "parts": [
          {
            "type": "enum",
            "values": [
              "root1",
              "root2",
              "root3"
            ]
          },
          {
            "type": "constant",
            "constVal": "/"
          },
          {
            "type": "enum",
            "values": [
              "mid1",
              "mid2",
              "mid3"
            ]
          },
          {
            "type": "constant",
            "constVal": "/"
          },
          {
            "type": "enum",
            "values": [
              "end1",
              "end2",
              "end3"
            ]
          }
        ]
      },
      "editor": {
        "type": "array",
        "arrayContent": {
          "type": "string",
          "minLength": 1,
          "maxLength": 1
        }
      },
      "title": {
        "type": "string",
        "minLength": 5,
        "maxLength": 10
      },
      "booktitle": {
        "type": "double",
        "min": 100,
        "max": 1000
      },
      "publisher": {
        "type": "enum",
        "values": [
          "Publisher 1",
          "Publisher 2",
          "Publisher 3",
          "Publisher 4",
          "Publisher 5"
        ]
      },
      "volume": {
        "type": "double",
        "min": 100,
        "max": 1000
      },
      "year": {
        "type": "enum",
        "values": [
          "2015",
          "2016",
          "2017",
          "2018",
          "2019"
        ]
      }
    },
    "indexes": [
      {
        "name": "editor-index",
        "key": {
          "editor": 1
        }
      }
    ]
  },
  {
    "database": "local",
    "collection": "inproceedings",
    "count": 50000,
    "content": {
      "key": {
        "type": "stringFromParts",
        "parts": [
          {
            "type": "enum",
            "values": [
              "root1",
              "root2",
              "root3"
            ]
          },
          {
            "type": "constant",
            "constVal": "/"
          },
          {
            "type": "enum",
            "values": [
              "mid1",
              "mid2",
              "mid3"
            ]
          },
          {
            "type": "constant",
            "constVal": "/"
          },
          {
            "type": "enum",
            "values": [
              "end1",
              "end2",
              "end3"
            ]
          }
        ]
      },
      "author": {
        "type": "array",
        "arrayContent": {
          "type": "string",
          "minLength": 1,
          "maxLength": 1
        }
      },
      "title": {
        "type": "string",
        "minLength": 5,
        "maxLength": 10
      },
      "pages": {
        "type": "stringFromParts",
        "parts": [
          {
            "type": "int",
            "min": 200,
            "max": 250
          },
          {
            "type": "constant",
            "constVal": "-"
          },
          {
            "type": "int",
            "min": 251,
            "max": 300
          }
        ]
      },
      "year": {
        "type": "enum",
        "values": [
          "2015",
          "2016",
          "2017",
          "2018",
          "2019"
        ]
      },
      "booktitle": {
        "type": "string",
        "minLength": 5,
        "maxLength": 10
      }
    },
    "indexes": [
      {
        "name": "author-index",
        "key": {
          "author": 1
        }
      }
    ]
  }
]

而且它创建了你所拥有的记录数量(3 M,50 K)。注意作者索引。
运行前面给定的带有explain的查询将显示:

Documents returned: 64
Actual query execution time (ms): 24708
Query used the following indexes: 1 (author-index)

记录如下:

[{
  editorCount: 1643,
  name: "E",
  authorCount: 1201,
  totalCount: 2844
},{
  editorCount: 1616,
  name: "w",
  authorCount: 1200,
  totalCount: 2816
},{
  editorCount: 1589,
  name: "k",
  authorCount: 1217,
  totalCount: 2806
},{...}]

因此,我认为2.5 seconds对于这种类型的查询对于3 M记录来说还不错。
最后一个更新,我在想,你根本不需要连接。
通过在两个集合上设置editorauthor索引,任何小的匹配或过滤都将非常快,即使是3 M条记录。(分别针对编辑和作者),它会一直使用索引。您可以在应用程序代码中对总数求和。类似于:

const { MongoClient } = require('mongodb')
const uri = 'mongodb://localhost:27017/local'

const client = new MongoClient(uri)
const run = async () => {
  try {
    await client.connect()

    const db = client.db('local')
    const proceedings = db.collection('proceeding_collection')
    const inproceedings = db.collection('inproceedings')

    const start = new Date().getTime()

    const distinctEditors = (await proceedings.distinct('editor')).filter(e => e !== undefined)
    const editors = await Promise.all(distinctEditors.map(async (e) => {
      const editorCount = await proceedings.count({ editor: e })
      const authorCount = await inproceedings.count({ author: e })
      const totalCount = editorCount + authorCount
      return { name: e, editorCount, authorCount, totalCount }
    }))

    const elapsed = new Date().getTime() - start

    console.log('editors', editors, editors.length)
    console.log('Took', elapsed, 'ms')
  } finally {
    await client.close()
  }
}
run().catch(console.dir)

对我来说,它以1032ms的速度运行。几乎快了2.5倍。

相关问题