我已经创建了一个基本的例子,自定义Pandas扩展类型存储二维坐标,与源代码如下。
我可以使用pd.array()成功地创建这种类型的数组,它的工作方式与预期的一样:
arr = pd.array([(1.5, 2.0), (156, 21), (-120, 98.5)], dtype='coordinate')
第一个月
但是,当使用该数组初始化一个系列,或直接初始化一个系列并指定'coordinate' dtype时,我得到以下错误:
Cell In [58], line 1
----> 1 pd.Series(coords, dtype='coordinate')
File ~/.local/lib/python3.9/site-packages/pandas/core/series.py:474, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
472 manager = get_option("mode.data_manager")
473 if manager == "block":
--> 474 data = SingleBlockManager.from_array(data, index)
475 elif manager == "array":
476 data = SingleArrayManager.from_array(data, index)
File ~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py:1912, in SingleBlockManager.from_array(cls, array, index)
1907 @classmethod
1908 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1909 """
1910 Constructor for if we have an array that is not yet a Block.
1911 """
-> 1912 block = new_block(array, placement=slice(0, len(index)), ndim=1)
1913 return cls(block, index)
File ~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2181, in new_block(values, placement, ndim)
2178 klass = get_block_type(values.dtype)
2180 values = maybe_coerce_values(values)
-> 2181 return klass(values, ndim=ndim, placement=placement)
TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got CoordinateArray)
这似乎是一个问题,初始化块来保存数据,但我不知道为什么。扩展类型定义:
import numpy as np
import pandas as pd
from functools import total_ordering
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.dtypes import PandasExtensionDtype
from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin
@total_ordering
class Coordinate(object):
"""
Simple class to represent a 2D coordinate with X and Y components.
Could extend with more useful methods etc
"""
def __init__(self, x, y):
self.x = float(x)
self.y = float(y)
def __getitem__(self, index):
"""
Allows object to act like (x, y) coordinate pair with indexing
"""
if index == 0:
return self.x
elif index == 1:
return self.y
else:
raise KeyError('Invalid coordinate index: {}'.format(index))
def as_tuple(self):
"""
Return as (x, y) coordinate pair
"""
return (self.x, self.y)
def __len__(self):
return 2
def __repr__(self):
return 'Coordinate({}, {})'.format(self.x, self.y)
# Operator support
def __add__(self, other):
"""
Add scalar value or other coordinate
"""
if isinstance(other, (int, float)):
return Coordinate(self.x + other, self.y + other)
other_coord = create_coordinate(other)
return Coordinate(self.x + other_coord.x, self.y + other_coord.y)
def __sub__(self, other):
"""
Subtract scalar value or other coordinate
"""
if isinstance(other, (int, float)):
return Coordinate(self.x - other, self.y - other)
other_coord = create_coordinate(other)
return Coordinate(self.x - other_coord.x, self.y - other_coord.y)
def __mul__(self, other):
if isinstance(other, (int, float)):
return Coordinate(self.x * other, self.y * other)
else:
raise TypeError('Cannot multiply coordinate by {}'.format(type(other)))
def __neg__(self):
return Coordinate(-self.x, -self.y)
def __eq__(self, other):
other_coord = create_coordinate(other)
return self.x == other_coord.x and self.y == other_coord.y
def __lt__(self, other):
other_coord = create_coordinate(other)
return self.x < other_coord.x and self.y < other_coord.y
def create_coordinate(val):
"""
Factory function for constructing a Coordinate from various
types of inputs
"""
if isinstance(val, Coordinate):
return val
if isinstance(val, (list, tuple)) and len(val) == 2:
# Construct from list-like of X,Y value pair
return Coordinate(val[0], val[1])
raise ValueError('Invalid value to create Coordinate from: {}'.format(val))
@register_extension_dtype
class CoordinateDtype(PandasExtensionDtype):
"""
Class to describe the custom Coordinate data type
"""
type = Coordinate # Scalar type for data
name = 'coordinate' # String identifying the data type (for display)
_metadata = ('name',) # List of attributes to uniquely identify this data type
@classmethod
def construct_array_type(cls):
"""
Return array type associated with this dtype
"""
return CoordinateArray
def __str__(self):
return self.name
class CoordinateArray(ExtensionArray, ExtensionScalarOpsMixin):
"""
Custom Extension Array type for an array of Coordinates
Needs to define:
- Associated Dtype it is used with
- How to construct array from sequence of scalars
- How data is stored and accessed
- Any custom array methods
"""
dtype = CoordinateDtype
def __init__(self, x_values, y_values, copy=False):
"""
Initialise array of coordinates from component X and Y values
(Allows efficient initialisation from existing lists/arrays)
"""
self.x_values = np.array(x_values, dtype=np.float64, copy=copy)
self.y_values = np.array(y_values, dtype=np.float64, copy=copy)
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
# Construct new array from sequence of values (Unzip coordinates into x and y components)
x_values, y_values = zip(*[create_coordinate(val).as_tuple() for val in scalars])
return CoordinateArray(x_values, y_values, copy=copy)
@classmethod
def from_coordinates(cls, coordinates):
"""
Construct array from sequence of values (coordinates)
Can be provided as Coordinate instances or list/tuple like (x, y) pairs
"""
return cls._from_sequence(coordinates)
@classmethod
def _concat_same_type(cls, to_concat):
"""
Concatenate multiple arrays of this dtype
"""
return CoordinateArray(
np.concatenate(arr.x_values for arr in to_concat),
np.concatenate(arr.y_values for arr in to_concat),
)
@property
def nbytes(self):
"""
The number of bytes needed to store this object in memory.
"""
return self.x_values.nbytes + self.y_values.nbytes
def __getitem__(self, item):
"""
Retrieve single item or slice
"""
if isinstance(item, int):
# Get single coordinate
return Coordinate(self.x_values[item], self.y_values[item])
else:
# Get subset from slice or boolean array
return CoordinateArray(self.x_values[item], self.y_values[item])
def __eq__(self, other):
"""
Perform element-wise equality with a given coordinate value
"""
if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
return NotImplemented
return (self.x_values == other[0]) & (self.y_values == other[1])
def __len__(self):
return self.x_values.size
def isna(self):
"""
Returns a 1-D array indicating if each value is missing
"""
return np.isnan(self.x_values)
def take(self, indices, *, allow_fill=False, fill_value=None):
"""
Take element from array using boolean index
"""
from pandas.core.algorithms import take
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
x_result = take(self.x_values, indices, fill_value=fill_value, allow_fill=allow_fill)
y_result = take(self.y_values, indices, fill_value=fill_value, allow_fill=allow_fill)
return CoordinateArray(x_result, y_result)
def copy(self):
"""
Return copy of array
"""
return CoordinateArray(np.copy(self.x_values), np.copy(self.y_values))
# Register operator overloads using logic defined in Coordinate class
CoordinateArray._add_arithmetic_ops()
CoordinateArray._add_comparison_ops()
1条答案
按热度按时间d7v8vwbk1#
发现问题,原因是
CoordinateArray
将dtype
属性设置为CoordinateDtype
类,而不是作为返回CoordinateDtype()
示例的属性