Swift: LiDAR depth + Vision hand tracking for 3D hand tracking

esyap4oy, posted 2023-02-21 in Swift

I want to combine Vision's 2D hand-tracking input with ARKit > People Occlusion > body segmentation with depth (which uses LiDAR) to get the 3D world coordinates of the tip of the index finger.
The steps I am taking:
1 - Vision works and gives me the fingertip's 2D screen position
2 - The depth data read from the CVPixelBuffer also seems correct
3 - The unprojection from 2D screen coordinates + depth to 3D world coordinates is wrong
Ideally I would end up with something like Josh Casperz's LiDAR Lab app:

Here is my code that turns the 2D point coordinates + depth into 3D world coordinates:

// Result from Vision framework
// Coordinates top right of the screen with Y to the left, X down
indexTip = CGPoint(x: (indexTipPoint.location.x) * CGFloat(arView.bounds.width),
                   y: (1 - indexTipPoint.location.y) * CGFloat(arView.bounds.height))

if let segmentationBuffer: CVPixelBuffer = frame.estimatedDepthData {

    let segmentationWidth = CVPixelBufferGetWidth(segmentationBuffer)
    let segmentationHeight = CVPixelBufferGetHeight(segmentationBuffer)

    let xConverted: CGFloat = indexTip.x * CGFloat(segmentationWidth) / CGFloat(arView.bounds.width)
    let yConverted: CGFloat = indexTip.y * CGFloat(segmentationHeight) / CGFloat(arView.bounds.height)

    if let indexDepth: Float = segmentationBuffer.value(column: Int(xConverted), row: Int(yConverted)) {

        if indexDepth != 0 {
            let cameraIntrinsics = frame.camera.intrinsics

            var xrw: Float = (Float(indexTip.x) - cameraIntrinsics[2][0]) * indexDepth
            xrw = xrw / cameraIntrinsics[0][0]
            var yrw: Float = (Float(indexTip.y) - cameraIntrinsics[2][1]) * indexDepth
            yrw = yrw / cameraIntrinsics[1][1]
            let xyzw: SIMD4<Float> = SIMD4<Float>(xrw, yrw, indexDepth, 1.0)
            let vecResult = frame.camera.viewMatrix(for: .portrait) * xyzw

            resultAnchor.setPosition(SIMD3<Float>(vecResult.x, vecResult.y, vecResult.z), relativeTo: nil)
        }
    }
}

Here is a video of what it looks like when it runs; the result always seems to end up in the same region of space: Video
These calculations are basically taken from the sample code Displaying a Point Cloud Using Scene Depth.
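
For reference, the core of that sample's unprojection is, roughly, inverting the camera intrinsics to go from a pixel + depth to a camera-space point, and then applying a camera-to-world transform. The sketch below is a paraphrase of that idea, not the sample's exact code; the function name and the cameraToWorld parameter (assumed to be the inverse of the frame's view matrix for the relevant orientation) are my own:

import simd

// Rough paraphrase of the Point Cloud sample's unprojection (not the sample's exact code).
// `u`, `v` are pixel coordinates in the captured image, `depth` is in meters,
// `cameraIntrinsics` is frame.camera.intrinsics, and `cameraToWorld` is assumed to be
// the inverse of the view matrix for the frame's orientation.
func unproject(u: Float, v: Float, depth: Float,
               cameraIntrinsics: simd_float3x3,
               cameraToWorld: simd_float4x4) -> SIMD3<Float> {
    // Back-project the pixel into camera space using the inverse intrinsics.
    let localPoint = cameraIntrinsics.inverse * SIMD3<Float>(u, v, 1) * depth
    // Transform the camera-space point into world space.
    let worldPoint = cameraToWorld * SIMD4<Float>(localPoint, 1)
    return SIMD3<Float>(worldPoint.x, worldPoint.y, worldPoint.z)
}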

Finally, here is the full zip file if you want to try it yourself: ZIP.
Do you know what is wrong with my calculations?


gab6jxml #1

The answer seems to be here: https://developer.apple.com/forums/thread/705216?answerId=712036022#712036022 and here: https://github.com/Reality-Dev/BodyTracking


ktecyv1j #2

@oscar-falmer Yes, I wrote that answer on the Apple Developer Forums and made it into the body tracking package. I tried to link to them here as well, but someone came along and deleted my answer on the grounds that "answers that are little more than a link may be deleted." For some reason they left yours up. Here is the solution copied over; if you find it useful, please upvote it.
Vision results come in Vision coordinates: normalized, with (0,0) at the bottom-left and (1,1) at the top-right. AVFoundation coordinates have (0,0) at the top-left and (1,1) at the bottom-right. To convert from Vision coordinates to AVFoundation coordinates, you flip the Y axis like this:

public extension CGPoint {
    func convertVisionToAVFoundation() -> CGPoint {
        return CGPoint(x: self.x, y: 1 - self.y)
    }
}

That AVFoundation coordinate is what should be used as the input for indexing into the depth buffer, like this:

public extension CVPixelBuffer {

    /// The input point must be in normalized AVFoundation coordinates,
    /// i.e. (0,0) is the top-left and (1,1) the bottom-right.
    func value(from point: CGPoint) -> Float? {
        let width = CVPixelBufferGetWidth(self)
        let height = CVPixelBufferGetHeight(self)
        let colPosition = Int(point.x * CGFloat(width))
        let rowPosition = Int(point.y * CGFloat(height))
        return value(column: colPosition, row: rowPosition)
    }

    func value(column: Int, row: Int) -> Float? {
        guard CVPixelBufferGetPixelFormatType(self) == kCVPixelFormatType_DepthFloat32 else { return nil }

        CVPixelBufferLockBaseAddress(self, .readOnly)
        if let baseAddress = CVPixelBufferGetBaseAddress(self) {
            let width = CVPixelBufferGetWidth(self)
            let index = column + (row * width)
            let offset = index * MemoryLayout<Float>.stride
            let value = baseAddress.load(fromByteOffset: offset, as: Float.self)
            CVPixelBufferUnlockBaseAddress(self, .readOnly)
            return value
        }
        CVPixelBufferUnlockBaseAddress(self, .readOnly)
        return nil
    }
}

That is all you need to get the depth at a given position from a Vision request.
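Putting those two extensions together, a minimal sketch of the lookup might look like this (indexTipPoint is assumed to be the VNRecognizedPoint for the index tip from your hand-pose request, and frame the current ARFrame with depth data enabled):

// Minimal sketch: Vision point -> AVFoundation point -> depth lookup.
// `indexTipPoint` and `frame` are assumed to come from your own Vision/ARKit code.
let avFoundationPoint = indexTipPoint.location.convertVisionToAVFoundation()

if let depthAtTip = frame.estimatedDepthData?.value(from: avFoundationPoint) {
    print("Index tip is \(depthAtTip) meters from the camera")
}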
If you want to find the position on screen, for use with UIKit or with something like ARView.ray(through:), further conversion is required. The Vision request was performed on arView.session.currentFrame.capturedImage. From the documentation for ARFrame.displayTransform(for:viewportSize:):
Normalized image coordinates range from (0,0) in the upper-left corner of the image to (1,1) in the lower-right corner. This method creates an affine transform representing the rotation and aspect-fit crop operations necessary to adapt the camera image to the specified orientation and to the aspect ratio of the specified viewport. The affine transform does not scale to the viewport's pixel size. The capturedImage pixel buffer is the original image captured by the device camera, and thus not adjusted for device orientation or view aspect ratio.
So the image rendered on screen is a cropped version of the frame the camera captures, and a conversion is needed to go from AVFoundation coordinates to display (UIKit) coordinates:

public extension ARView {

    func convertAVFoundationToScreenSpace(_ point: CGPoint) -> CGPoint? {
        // Convert from normalized AVFoundation coordinates
        // ((0,0) top-left, (1,1) bottom-right) to screen-space coordinates.
        guard
            let arFrame = session.currentFrame,
            let interfaceOrientation = window?.windowScene?.interfaceOrientation
        else { return nil }

        let transform = arFrame.displayTransform(for: interfaceOrientation, viewportSize: frame.size)
        let normalizedCenter = point.applying(transform)
        let center = normalizedCenter.applying(CGAffineTransform.identity.scaledBy(x: frame.width, y: frame.height))
        return center
    }
}

To go the other way, from UIKit display coordinates to AVFoundation coordinates:

public extension ARView {

    func convertScreenSpaceToAVFoundation(_ point: CGPoint) -> CGPoint? {
        // Convert from screen-space UIKit coordinates to normalized
        // pixel coordinates ((0,0) top-left, (1,1) bottom-right).
        guard
            let arFrame = session.currentFrame,
            let interfaceOrientation = window?.windowScene?.interfaceOrientation
        else { return nil }

        let inverseScaleTransform = CGAffineTransform.identity.scaledBy(x: frame.width, y: frame.height).inverted()
        let invertedDisplayTransform = arFrame.displayTransform(for: interfaceOrientation, viewportSize: frame.size).inverted()
        let unScaledPoint = point.applying(inverseScaleTransform)
        let normalizedCenter = unScaledPoint.applying(invertedDisplayTransform)
        return normalizedCenter
    }
}

To get a world-space coordinate from a UIKit screen coordinate and the corresponding depth value:

/// Get the world-space position from a UIKit screen point and a depth value
/// - Parameters:
///   - screenPosition: A CGPoint representing a point on screen in UIKit coordinates.
///   - depth: The depth at this coordinate, in meters.
/// - Returns: The position in world space of this coordinate at this depth.
private func worldPosition(screenPosition: CGPoint, depth: Float) -> simd_float3? {
    guard
        let rayResult = arView.ray(through: screenPosition)
    else { return nil }

    // rayResult.direction is a normalized (1 meter long) vector pointing in the
    // correct direction, and we want to go the length of depth along this vector.
    let worldOffset = rayResult.direction * depth
    let worldPosition = rayResult.origin + worldOffset
    return worldPosition
}

To set an entity's position in world space for a given point on screen:

guard
    let currentFrame = arView.session.currentFrame,
    let sceneDepth = (currentFrame.smoothedSceneDepth ?? currentFrame.sceneDepth)?.depthMap,
    let depthAtPoint = sceneDepth.value(from: avFoundationPosition),
    let worldPosition = worldPosition(screenPosition: uiKitPosition, depth: depthAtPoint)
else { return }

trackedEntity.setPosition(worldPosition, relativeTo: nil)
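
For completeness, the avFoundationPosition and uiKitPosition used above would come from the earlier conversions, roughly like this (indexTipPoint is again assumed to be the Vision result for the index tip):

// Sketch: deriving the two coordinates used in the snippet above.
let avFoundationPosition = indexTipPoint.location.convertVisionToAVFoundation()

guard let uiKitPosition = arView.convertAVFoundationToScreenSpace(avFoundationPosition) else { return }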

Don't forget to set the appropriate frameSemantics on your ARConfiguration:

func runNewConfig() {
    // Create a session configuration
    let configuration = ARWorldTrackingConfiguration()

    // Goes with (currentFrame.smoothedSceneDepth ?? currentFrame.sceneDepth)?.depthMap
    let frameSemantics: ARConfiguration.FrameSemantics = [.smoothedSceneDepth, .sceneDepth]

    // Goes with currentFrame.estimatedDepthData
    //let frameSemantics: ARConfiguration.FrameSemantics = .personSegmentationWithDepth

    if ARWorldTrackingConfiguration.supportsFrameSemantics(frameSemantics) {
        configuration.frameSemantics.insert(frameSemantics)
    }

    // Run the view's session
    session.run(configuration)
}
