pandas 从自定义ExtensionArray创建系列时出现TypeError

wztqucjr  于 2023-01-01  发布在  其他
关注(0)|答案(1)|浏览(303)

我已经创建了一个基本的例子,自定义Pandas扩展类型存储二维坐标,与源代码如下。
我可以使用pd.array()成功地创建这种类型的数组,它的工作方式与预期的一样:

arr = pd.array([(1.5, 2.0), (156, 21), (-120, 98.5)], dtype='coordinate')

第一个月
但是,当使用该数组初始化一个系列,或直接初始化一个系列并指定'coordinate' dtype时,我得到以下错误:

Cell In [58], line 1
----> 1 pd.Series(coords, dtype='coordinate')

File ~/.local/lib/python3.9/site-packages/pandas/core/series.py:474, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    472 manager = get_option("mode.data_manager")
    473 if manager == "block":
--> 474     data = SingleBlockManager.from_array(data, index)
    475 elif manager == "array":
    476     data = SingleArrayManager.from_array(data, index)

File ~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py:1912, in SingleBlockManager.from_array(cls, array, index)
   1907 @classmethod
   1908 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
   1909     """
   1910     Constructor for if we have an array that is not yet a Block.
   1911     """
-> 1912     block = new_block(array, placement=slice(0, len(index)), ndim=1)
   1913     return cls(block, index)

File ~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2181, in new_block(values, placement, ndim)
   2178 klass = get_block_type(values.dtype)
   2180 values = maybe_coerce_values(values)
-> 2181 return klass(values, ndim=ndim, placement=placement)

TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got CoordinateArray)

这似乎是一个问题,初始化块来保存数据,但我不知道为什么。扩展类型定义:

import numpy as np
import pandas as pd
from functools import total_ordering
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.dtypes import PandasExtensionDtype
from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin

@total_ordering
class Coordinate(object):
    """
    Simple class to represent a 2D coordinate with X and Y components.
    Could extend with more useful methods etc
    """
    def __init__(self, x, y):
        self.x = float(x)
        self.y = float(y)

    def __getitem__(self, index):
        """
        Allows object to act like (x, y) coordinate pair with indexing
        """
        if index == 0:
            return self.x
        elif index == 1:
            return self.y
        else:
            raise KeyError('Invalid coordinate index: {}'.format(index))

    def as_tuple(self):
        """
        Return as (x, y) coordinate pair
        """
        return (self.x, self.y)

    def __len__(self):
        return 2

    def __repr__(self):
        return 'Coordinate({}, {})'.format(self.x, self.y)

    # Operator support
    def __add__(self, other):
        """
        Add scalar value or other coordinate
        """
        if isinstance(other, (int, float)):
            return Coordinate(self.x + other, self.y + other)

        other_coord = create_coordinate(other)
        return Coordinate(self.x + other_coord.x, self.y + other_coord.y)
    
    def __sub__(self, other):
        """
        Subtract scalar value or other coordinate
        """
        if isinstance(other, (int, float)):
            return Coordinate(self.x - other, self.y - other)

        other_coord = create_coordinate(other)
        return Coordinate(self.x - other_coord.x, self.y - other_coord.y)

    def __mul__(self, other):
        if isinstance(other, (int, float)):
            return Coordinate(self.x * other, self.y * other)
        else:
            raise TypeError('Cannot multiply coordinate by {}'.format(type(other)))

    def __neg__(self):
        return Coordinate(-self.x, -self.y)

    def __eq__(self, other):
        other_coord = create_coordinate(other)
        return self.x == other_coord.x and self.y == other_coord.y

    def __lt__(self, other):
        other_coord = create_coordinate(other)
        return self.x < other_coord.x and self.y < other_coord.y 



def create_coordinate(val):
    """
    Factory function for constructing a Coordinate from various
    types of inputs
    """
    if isinstance(val, Coordinate):
        return val

    if isinstance(val, (list, tuple)) and len(val) == 2:
        # Construct from list-like of X,Y value pair
        return Coordinate(val[0], val[1])

    raise ValueError('Invalid value to create Coordinate from: {}'.format(val))
    

@register_extension_dtype
class CoordinateDtype(PandasExtensionDtype):
    """
    Class to describe the custom Coordinate data type
    """
    type = Coordinate       # Scalar type for data
    name = 'coordinate'     # String identifying the data type (for display)
    _metadata = ('name',)   # List of attributes to uniquely identify this data type

    @classmethod
    def construct_array_type(cls):
        """
        Return array type associated with this dtype
        """
        return CoordinateArray

    def __str__(self):
        return self.name

class CoordinateArray(ExtensionArray, ExtensionScalarOpsMixin):
    """
    Custom Extension Array type for an array of Coordinates
    Needs to define:
    - Associated Dtype it is used with
    - How to construct array from sequence of scalars
    - How data is stored and accessed
    - Any custom array methods
    """
    dtype = CoordinateDtype

    def __init__(self, x_values, y_values, copy=False):
        """
        Initialise array of coordinates from component X and Y values 
        (Allows efficient initialisation from existing lists/arrays)
        """
        self.x_values = np.array(x_values, dtype=np.float64, copy=copy)
        self.y_values = np.array(y_values, dtype=np.float64, copy=copy)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        # Construct new array from sequence of values (Unzip coordinates into x and y components)
        x_values, y_values = zip(*[create_coordinate(val).as_tuple() for val in scalars])
        return CoordinateArray(x_values, y_values, copy=copy)

    @classmethod
    def from_coordinates(cls, coordinates):
        """
        Construct array from sequence of values (coordinates)
        Can be provided as Coordinate instances or list/tuple like (x, y) pairs
        """
        return cls._from_sequence(coordinates)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate multiple arrays of this dtype
        """
        return CoordinateArray(
            np.concatenate(arr.x_values for arr in to_concat),
            np.concatenate(arr.y_values for arr in to_concat),
        )

    @property
    def nbytes(self):
        """
        The number of bytes needed to store this object in memory.
        """
        return self.x_values.nbytes + self.y_values.nbytes

    def __getitem__(self, item):
        """
        Retrieve single item or slice
        """
        if isinstance(item, int):
            # Get single coordinate
            return Coordinate(self.x_values[item], self.y_values[item])

        else:
            # Get subset from slice  or boolean array
            return CoordinateArray(self.x_values[item], self.y_values[item])

    def __eq__(self, other):
        """
        Perform element-wise equality with a given coordinate value
        """
        if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
            return NotImplemented

        return (self.x_values == other[0]) & (self.y_values == other[1])

    def __len__(self):
        return self.x_values.size

    def isna(self):
        """
        Returns a 1-D array indicating if each value is missing
        """
        return np.isnan(self.x_values)

    def take(self, indices, *, allow_fill=False, fill_value=None):
        """
        Take element from array using boolean index

        """
        from pandas.core.algorithms import take
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        x_result = take(self.x_values, indices, fill_value=fill_value, allow_fill=allow_fill)
        y_result = take(self.y_values, indices, fill_value=fill_value, allow_fill=allow_fill)
        return CoordinateArray(x_result, y_result)

    def copy(self):
        """
        Return copy of array
        """
        return CoordinateArray(np.copy(self.x_values), np.copy(self.y_values))

# Register operator overloads using logic defined in Coordinate class
CoordinateArray._add_arithmetic_ops()
CoordinateArray._add_comparison_ops()
d7v8vwbk

d7v8vwbk1#

发现问题,原因是CoordinateArraydtype属性设置为CoordinateDtype类,而不是作为返回CoordinateDtype()示例的属性

相关问题