__all__ = [
"NaT",
"NaTType",
"OutOfBoundsDatetime",
"Period",
"Timedelta",
"Timestamp",
"iNaT",
"Interval",
]
from pandas._libs.interval import Interval
from pandas._libs.tslibs import (
NaT,
NaTType,
OutOfBoundsDatetime,
Period,
Timedelta,
Timestamp,
iNaT,
)
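For orientation, these re-exports expose the C-implemented scalar types at the `pandas._libs` top level. A quick doctest-style check (illustrative only, assuming a standard pandas install):

>>> from pandas._libs import NaT, Timestamp, Timedelta, iNaT
>>> Timestamp("2016-01-01") + Timedelta("1D")
Timestamp('2016-01-02 00:00:00')
>>> iNaT  # the int64 sentinel backing NaT
-9223372036854775808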
from pandas._libs.util cimport numeric
cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil
"""
Template for each `dtype` helper function using 1-d template
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
# ----------------------------------------------------------------------
# ensure_dtype
# ----------------------------------------------------------------------
cdef int PLATFORM_INT = (<ndarray>np.arange(0, dtype=np.intp)).descr.type_num
def ensure_platform_int(object arr):
# GH3033, GH1392
# platform int is the size of the int pointer, e.g. np.intp
if util.is_array(arr):
if (<ndarray>arr).descr.type_num == PLATFORM_INT:
return arr
else:
# equiv: arr.astype(np.intp)
return cnp.PyArray_Cast(<ndarray>arr, PLATFORM_INT)
else:
return np.array(arr, dtype=np.intp)
def ensure_object(object arr):
if util.is_array(arr):
if (<ndarray>arr).descr.type_num == NPY_OBJECT:
return arr
else:
# equiv: arr.astype(object)
return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT)
else:
return np.array(arr, dtype=np.object_)
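Both helpers are no-ops when the input already has the target dtype, cast otherwise, and fall back to `np.array` for non-ndarray input. An illustrative check, assuming these are compiled into `pandas._libs.algos` as in released pandas builds:

>>> import numpy as np
>>> from pandas._libs.algos import ensure_platform_int, ensure_object
>>> arr = np.arange(3, dtype=np.intp)
>>> ensure_platform_int(arr) is arr  # already platform int: returned unchanged
True
>>> ensure_object(np.array([1, 2])).dtype  # cast path
dtype('O')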
{{py:
# name, c_type, dtype
dtypes = [('float64', 'FLOAT64', 'float64'),
('float32', 'FLOAT32', 'float32'),
('int8', 'INT8', 'int8'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint8', 'UINT8', 'uint8'),
('uint16', 'UINT16', 'uint16'),
('uint32', 'UINT32', 'uint32'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
]
def get_dispatch(dtypes):
for name, c_type, dtype in dtypes:
yield name, c_type, dtype
}}
{{for name, c_type, dtype in get_dispatch(dtypes)}}
def ensure_{{name}}(object arr, copy=True):
if util.is_array(arr):
if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
return arr
else:
return arr.astype(np.{{dtype}}, copy=copy)
else:
return np.array(arr, dtype=np.{{dtype}})
{{endfor}}
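For readers unfamiliar with tempita: after expansion, each (name, c_type, dtype) triple in the list above becomes a concrete function. The int64 instantiation, for example, renders to:

def ensure_int64(object arr, copy=True):
    if util.is_array(arr):
        if (<ndarray>arr).descr.type_num == NPY_INT64:
            return arr
        else:
            return arr.astype(np.int64, copy=copy)
    else:
        return np.array(arr, dtype=np.int64)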
"""
Template for each `dtype` helper function for take
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
# ----------------------------------------------------------------------
# take_1d, take_2d
# ----------------------------------------------------------------------
{{py:
# c_type_in, c_type_out
dtypes = [
('uint8_t', 'uint8_t'),
('uint8_t', 'object'),
('int8_t', 'int8_t'),
('int8_t', 'int32_t'),
('int8_t', 'int64_t'),
('int8_t', 'float64_t'),
('int16_t', 'int16_t'),
('int16_t', 'int32_t'),
('int16_t', 'int64_t'),
('int16_t', 'float64_t'),
('int32_t', 'int32_t'),
('int32_t', 'int64_t'),
('int32_t', 'float64_t'),
('int64_t', 'int64_t'),
('int64_t', 'float64_t'),
('float32_t', 'float32_t'),
('float32_t', 'float64_t'),
('float64_t', 'float64_t'),
('object', 'object'),
]
def get_dispatch(dtypes):
for (c_type_in, c_type_out) in dtypes:
def get_name(dtype_name):
if dtype_name == "object":
return "object"
if dtype_name == "uint8_t":
return "bool"
return dtype_name[:-2]
name = get_name(c_type_in)
dest = get_name(c_type_out)
args = dict(name=name, dest=dest, c_type_in=c_type_in,
c_type_out=c_type_out)
yield (name, dest, c_type_in, c_type_out)
}}
{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
{{else}}
def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
{{endif}}
const intp_t[:] indexer,
{{c_type_out}}[:] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, n, idx
{{c_type_out}} fv
n = indexer.shape[0]
fv = fill_value
{{if c_type_out != "object"}}
with nogil:
{{else}}
if True:
{{endif}}
for i in range(n):
idx = indexer[i]
if idx == -1:
out[i] = fv
else:
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i] = True if values[idx] > 0 else False
{{else}}
out[i] = values[idx]
{{endif}}
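The contract of these kernels: an `indexer` entry of -1 means "missing" and is written out as `fill_value`; every other entry is a plain gather. A pure-NumPy reference sketch (helper name hypothetical, handy for sanity-checking the generated code):

import numpy as np

def take_1d_reference(values, indexer, fill_value=np.nan):
    # mirrors the template: -1 in indexer -> fill_value, else values[idx]
    mask = indexer == -1
    out = np.empty(len(indexer), dtype=np.result_type(values.dtype, fill_value))
    out[~mask] = values[indexer[~mask]]
    out[mask] = fill_value
    return out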
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
ndarray[intp_t] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, j, k, n, idx
{{c_type_out}} fv
n = len(indexer)
k = values.shape[1]
fv = fill_value
IF {{True if c_type_in == c_type_out != "object" else False}}:
cdef:
const {{c_type_out}} *v
{{c_type_out}} *o
# GH#3130
if (values.strides[1] == out.strides[1] and
values.strides[1] == sizeof({{c_type_out}}) and
sizeof({{c_type_out}}) * n >= 256):
for i in range(n):
idx = indexer[i]
if idx == -1:
for j in range(k):
out[i, j] = fv
else:
v = &values[idx, 0]
o = &out[i, 0]
memmove(o, v, <size_t>(sizeof({{c_type_out}}) * k))
return
for i in range(n):
idx = indexer[i]
if idx == -1:
for j in range(k):
out[i, j] = fv
else:
for j in range(k):
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i, j] = True if values[idx, j] > 0 else False
{{else}}
out[i, j] = values[idx, j]
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
ndarray[intp_t] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, j, k, n, idx
{{c_type_out}} fv
n = len(values)
k = len(indexer)
if n == 0 or k == 0:
return
fv = fill_value
for i in range(n):
for j in range(k):
idx = indexer[j]
if idx == -1:
out[i, j] = fv
else:
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i, j] = True if values[i, idx] > 0 else False
{{else}}
out[i, j] = values[i, idx]
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
indexer,
ndarray[{{c_type_out}}, ndim=2] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, j, k, n, idx
ndarray[intp_t] idx0 = indexer[0]
ndarray[intp_t] idx1 = indexer[1]
{{c_type_out}} fv
n = len(idx0)
k = len(idx1)
fv = fill_value
for i in range(n):
idx = idx0[i]
if idx == -1:
for j in range(k):
out[i, j] = fv
else:
for j in range(k):
if idx1[j] == -1:
out[i, j] = fv
else:
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i, j] = True if values[idx, idx1[j]] > 0 else False
{{else}}
out[i, j] = values[idx, idx1[j]]
{{endif}}
{{endfor}}
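These generated kernels back pandas' take machinery; the public entry point `pandas.api.extensions.take` ultimately dispatches to variants like the ones above. Illustrative usage:

>>> import numpy as np
>>> from pandas.api.extensions import take
>>> take(np.array([10, 20, 30]), [0, -1, 2], allow_fill=True, fill_value=np.nan)
array([10., nan, 30.])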
# ----------------------------------------------------------------------
# take_2d internal function
# ----------------------------------------------------------------------
ctypedef fused take_t:
float64_t
uint64_t
int64_t
object
cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx):
cdef:
Py_ssize_t i, j, N, K
ndarray[intp_t, ndim=2, cast=True] indexer = idx
ndarray[take_t, ndim=2] result
N, K = (<object>values).shape
if take_t is object:
# evaluated at compile-time
result = values.copy()
else:
result = np.empty_like(values)
for i in range(N):
for j in range(K):
result[i, j] = values[i, indexer[i, j]]
return result
from numpy cimport ndarray
cdef class NDArrayBacked:
cdef:
readonly ndarray _ndarray
readonly object _dtype
cpdef NDArrayBacked _from_backing_data(self, ndarray values)
cpdef __setstate__(self, state)
from typing import Sequence
import numpy as np
from pandas._typing import (
DtypeObj,
Shape,
)
class NDArrayBacked:
_dtype: DtypeObj
_ndarray: np.ndarray
def __init__(self, values: np.ndarray, dtype: DtypeObj): ...
@classmethod
def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ...
def _from_backing_data(self, values: np.ndarray): ...
def __setstate__(self, state): ...
def __len__(self) -> int: ...
@property
def shape(self) -> Shape: ...
@property
def ndim(self) -> int: ...
@property
def size(self) -> int: ...
@property
def nbytes(self) -> int: ...
def copy(self): ...
def delete(self, loc, axis=0): ...
def swapaxes(self, axis1, axis2): ...
def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
def reshape(self, *args, **kwargs): ...
def ravel(self, order="C"): ...
@property
def T(self): ...
"""
Cython implementations for internal ExtensionArrays.
"""
cimport cython
import numpy as np
cimport numpy as cnp
from numpy cimport ndarray
cnp.import_array()
@cython.freelist(16)
cdef class NDArrayBacked:
"""
Implementing these methods in cython improves performance quite a bit.
import pandas as pd
from pandas._libs.arrays import NDArrayBacked as cls
dti = pd.date_range("2016-01-01", periods=3)
dta = dti._data
arr = dta._ndarray
obj = cls._simple_new(arr, arr.dtype)
# for foo in [arr, dta, obj]: ...
%timeit foo.copy()
299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference)
530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked
1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked
328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__
371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simple_new
%timeit foo.T
125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference)
226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked
911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked
215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simple_new
"""
# TODO: implement take in terms of cnp.PyArray_TakeFrom
# TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate
# cdef:
# readonly ndarray _ndarray
# readonly object _dtype
def __init__(self, ndarray values, object dtype):
self._ndarray = values
self._dtype = dtype
@classmethod
def _simple_new(cls, ndarray values, object dtype):
cdef:
NDArrayBacked obj
obj = NDArrayBacked.__new__(cls)
obj._ndarray = values
obj._dtype = dtype
return obj
cpdef NDArrayBacked _from_backing_data(self, ndarray values):
"""
Construct a new ExtensionArray `new_array` with `arr` as its _ndarray.
This should round-trip:
self == self._from_backing_data(self._ndarray)
"""
# TODO: reuse _simple_new if/when it can be cpdef
cdef:
NDArrayBacked obj
obj = NDArrayBacked.__new__(type(self))
obj._ndarray = values
obj._dtype = self._dtype
return obj
cpdef __setstate__(self, state):
if isinstance(state, dict):
if "_data" in state:
data = state.pop("_data")
elif "_ndarray" in state:
data = state.pop("_ndarray")
else:
raise ValueError
self._ndarray = data
self._dtype = state.pop("_dtype")
for key, val in state.items():
setattr(self, key, val)
elif isinstance(state, tuple):
if len(state) != 3:
if len(state) == 1 and isinstance(state[0], dict):
self.__setstate__(state[0])
return
raise NotImplementedError(state)
data, dtype = state[:2]
if isinstance(dtype, np.ndarray):
dtype, data = data, dtype
self._ndarray = data
self._dtype = dtype
if isinstance(state[2], dict):
for key, val in state[2].items():
setattr(self, key, val)
else:
raise NotImplementedError(state)
else:
raise NotImplementedError(state)
def __len__(self) -> int:
return len(self._ndarray)
@property
def shape(self):
# object cast bc _ndarray.shape is npy_intp*
return (<object>(self._ndarray)).shape
@property
def ndim(self) -> int:
return self._ndarray.ndim
@property
def size(self) -> int:
return self._ndarray.size
@property
def nbytes(self) -> int:
return self._ndarray.nbytes
def copy(self):
# NPY_ANYORDER -> same order as self._ndarray
res_values = cnp.PyArray_NewCopy(self._ndarray, cnp.NPY_ANYORDER)
return self._from_backing_data(res_values)
def delete(self, loc, axis=0):
res_values = np.delete(self._ndarray, loc, axis=axis)
return self._from_backing_data(res_values)
def swapaxes(self, axis1, axis2):
res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
return self._from_backing_data(res_values)
# TODO: pass NPY_MAXDIMS equiv to axis=None?
def repeat(self, repeats, axis: int = 0):
if axis is None:
axis = 0
res_values = cnp.PyArray_Repeat(self._ndarray, repeats, <int>axis)
return self._from_backing_data(res_values)
def reshape(self, *args, **kwargs):
res_values = self._ndarray.reshape(*args, **kwargs)
return self._from_backing_data(res_values)
def ravel(self, order="C"):
# cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order)
# res_values = cnp.PyArray_Ravel(self._ndarray, order)
res_values = self._ndarray.ravel(order)
return self._from_backing_data(res_values)
@property
def T(self):
res_values = self._ndarray.T
return self._from_backing_data(res_values)
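A minimal sketch of how a subclass would ride on this base (class name hypothetical); `_from_backing_data` is what makes the inherited `copy`/`T`/`delete` return the subclass type:

import numpy as np
from pandas._libs.arrays import NDArrayBacked

class BackedDemo(NDArrayBacked):
    # hypothetical subclass for illustration only
    @classmethod
    def from_values(cls, values: np.ndarray):
        return cls._simple_new(values, values.dtype)

demo = BackedDemo.from_values(np.arange(5))
assert isinstance(demo.copy(), BackedDemo)  # _from_backing_data preserves type
assert demo.T.shape == (5,)
assert len(demo.delete(0)) == 4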
from typing import Literal
import numpy as np
def group_median_float64(
out: np.ndarray, # ndarray[float64_t, ndim=2]
counts: np.ndarray, # ndarray[int64_t]
values: np.ndarray, # ndarray[float64_t, ndim=2]
labels: np.ndarray, # ndarray[int64_t]
min_count: int = ..., # Py_ssize_t
) -> None: ...
def group_cumprod_float64(
out: np.ndarray, # float64_t[:, ::1]
values: np.ndarray, # const float64_t[:, :]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
skipna: bool = ...,
) -> None: ...
def group_cumsum(
out: np.ndarray, # numeric[:, ::1]
values: np.ndarray, # ndarray[numeric, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
skipna: bool = ...,
) -> None: ...
def group_shift_indexer(
out: np.ndarray, # int64_t[::1]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
periods: int,
) -> None: ...
def group_fillna_indexer(
out: np.ndarray, # ndarray[int64_t]
labels: np.ndarray, # ndarray[int64_t]
mask: np.ndarray, # ndarray[uint8_t]
direction: Literal["ffill", "bfill"],
limit: int, # int64_t
dropna: bool,
) -> None: ...
def group_any_all(
out: np.ndarray, # uint8_t[::1]
values: np.ndarray, # const uint8_t[::1]
labels: np.ndarray, # const int64_t[:]
mask: np.ndarray, # const uint8_t[::1]
val_test: Literal["any", "all"],
skipna: bool,
) -> None: ...
def group_add(
out: np.ndarray, # complexfloating_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[complexfloating_t, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
) -> None: ...
def group_prod(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ..., # Py_ssize_t
ddof: int = ..., # int64_t
) -> None: ...
def group_mean(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
) -> None: ...
def group_quantile(
out: np.ndarray, # ndarray[float64_t]
values: np.ndarray, # ndarray[numeric, ndim=1]
labels: np.ndarray, # ndarray[int64_t]
mask: np.ndarray, # ndarray[uint8_t]
q: float, # float64_t
interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
) -> None: ...
def group_last(
out: np.ndarray, # rank_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[rank_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ..., # Py_ssize_t
) -> None: ...
def group_nth(
out: np.ndarray, # rank_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[rank_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ..., # int64_t
rank: int = ..., # int64_t
) -> None: ...
def group_rank(
out: np.ndarray, # float64_t[:, ::1]
values: np.ndarray, # ndarray[rank_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
ties_method: Literal["average", "min", "max", "first", "dense"] = ...,
ascending: bool = ...,
pct: bool = ...,
na_option: Literal["keep", "top", "bottom"] = ...,
) -> None: ...
def group_max(
out: np.ndarray, # groupby_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
) -> None: ...
def group_cummax(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
) -> None: ...
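To make the shared calling convention concrete: each kernel writes into a preallocated `out`, with `labels` mapping each row of `values` to a group and negative labels marking excluded rows. A hedged pure-NumPy reference for the group_cumsum contract (name hypothetical; this ignores the is_datetimelike and skipna branches the real kernel handles):

import numpy as np

def group_cumsum_reference(values, labels, ngroups):
    # values: (N, K) float64, labels: (N,) int64, negative == excluded
    out = np.zeros_like(values)
    accum = np.zeros((ngroups, values.shape[1]), dtype=values.dtype)
    for i, lab in enumerate(labels):
        if lab < 0:
            continue  # excluded row: out[i] left untouched
        accum[lab] += values[i]
        out[i] = accum[lab]
    return out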
import numpy as np
def hash_object_array(
arr: np.ndarray, # np.ndarray[object]
key: str,
encoding: str = ...,
) -> np.ndarray: ... # np.ndarray[np.uint64]
# Translated from the reference implementation
# at https://github.com/veorq/SipHash
import cython
from libc.stdlib cimport (
free,
malloc,
)
import numpy as np
from numpy cimport (
import_array,
ndarray,
uint8_t,
uint32_t,
uint64_t,
)
import_array()
from pandas._libs.util cimport is_nan
DEF cROUNDS = 2
DEF dROUNDS = 4
@cython.boundscheck(False)
def hash_object_array(
ndarray[object] arr, str key, str encoding="utf8"
) -> np.ndarray[np.uint64]:
"""
Parameters
----------
arr : 1-d ndarray of objects
key : hash key; must be exactly 16 bytes once encoded
encoding : encoding for key & arr, defaults to 'utf8'
Returns
-------
1-d uint64 ndarray of hashes.
Raises
------
TypeError
If the array contains mixed types.
Notes
-----
Allowed values must be strings or nulls;
mixed array types will raise TypeError.
"""
cdef:
Py_ssize_t i, l, n
uint64_t[:] result
bytes data, k
uint8_t *kb
uint64_t *lens
char **vecs
char *cdata
object val
list datas = []
k = <bytes>key.encode(encoding)
kb = <uint8_t *>k
if len(k) != 16:
raise ValueError(
f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
)
n = len(arr)
# create an array of bytes
vecs = <char **>malloc(n * sizeof(char *))
lens = <uint64_t*>malloc(n * sizeof(uint64_t))
for i in range(n):
val = arr[i]
if isinstance(val, bytes):
data = <bytes>val
elif isinstance(val, str):
data = <bytes>val.encode(encoding)
elif val is None or is_nan(val):
# null, stringify and encode
data = <bytes>str(val).encode(encoding)
elif isinstance(val, tuple):
# GH#28969 we could have a tuple, but need to ensure that
# the tuple entries are themselves hashable before converting
# to str
hash(val)
data = <bytes>str(val).encode(encoding)
else:
raise TypeError(
f"{val} of type {type(val)} is not a valid type for hashing, "
"must be string or null"
)
l = len(data)
lens[i] = l
cdata = data
# keep the references alive through the end of the
# function
datas.append(data)
vecs[i] = cdata
result = np.empty(n, dtype=np.uint64)
with nogil:
for i in range(n):
result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
free(vecs)
free(lens)
return result.base # .base to retrieve underlying np.ndarray
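Illustrative usage (the key must encode to exactly 16 bytes; pandas' own hashing layer passes a 16-character default):

>>> import numpy as np
>>> from pandas._libs.hashing import hash_object_array
>>> vals = np.array(["a", "b", None], dtype=object)
>>> hash_object_array(vals, key="0123456789123456").dtype
dtype('uint64')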
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
return (x << b) | (x >> (64 - b))
cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
p[0] = <uint8_t>(v)
p[1] = <uint8_t>(v >> 8)
p[2] = <uint8_t>(v >> 16)
p[3] = <uint8_t>(v >> 24)
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
return (<uint64_t>p[0] |
<uint64_t>p[1] << 8 |
<uint64_t>p[2] << 16 |
<uint64_t>p[3] << 24 |
<uint64_t>p[4] << 32 |
<uint64_t>p[5] << 40 |
<uint64_t>p[6] << 48 |
<uint64_t>p[7] << 56)
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
uint64_t* v2, uint64_t* v3) nogil:
v0[0] += v1[0]
v1[0] = _rotl(v1[0], 13)
v1[0] ^= v0[0]
v0[0] = _rotl(v0[0], 32)
v2[0] += v3[0]
v3[0] = _rotl(v3[0], 16)
v3[0] ^= v2[0]
v0[0] += v3[0]
v3[0] = _rotl(v3[0], 21)
v3[0] ^= v0[0]
v2[0] += v1[0]
v1[0] = _rotl(v1[0], 17)
v1[0] ^= v2[0]
v2[0] = _rotl(v2[0], 32)
@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
uint8_t* key) nogil:
cdef uint64_t v0 = 0x736f6d6570736575ULL
cdef uint64_t v1 = 0x646f72616e646f6dULL
cdef uint64_t v2 = 0x6c7967656e657261ULL
cdef uint64_t v3 = 0x7465646279746573ULL
cdef uint64_t b
cdef uint64_t k0 = u8to64_le(key)
cdef uint64_t k1 = u8to64_le(key + 8)
cdef uint64_t m
cdef int i
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
cdef int left = datalen & 7
cdef int left_byte
b = (<uint64_t>datalen) << 56
v3 ^= k1
v2 ^= k0
v1 ^= k1
v0 ^= k0
while (data != end):
m = u8to64_le(data)
v3 ^= m
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= m
data += sizeof(uint64_t)
for i in range(left-1, -1, -1):
b |= (<uint64_t>data[i]) << (i * 8)
v3 ^= b
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= b
v2 ^= 0xff
for i in range(dROUNDS):
_sipround(&v0, &v1, &v2, &v3)
b = v0 ^ v1 ^ v2 ^ v3
return b
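One subtlety when checking this against pure Python: C uint64_t arithmetic wraps modulo 2**64 implicitly, while Python ints are unbounded, so a reference version of _rotl has to mask explicitly. A minimal sketch:

MASK64 = (1 << 64) - 1

def rotl64(x: int, b: int) -> int:
    # pure-Python mirror of _rotl; mask to emulate 64-bit wraparound
    return ((x << b) | (x >> (64 - b))) & MASK64

assert rotl64(1, 63) == 1 << 63
assert rotl64(1 << 63, 1) == 1  # high bit wraps back to bit 0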
from numpy cimport (
intp_t,
ndarray,
)
from pandas._libs.khash cimport (
complex64_t,
complex128_t,
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
kh_complex64_t,
kh_complex128_t,
kh_float32_t,
kh_float64_t,
kh_int8_t,
kh_int16_t,
kh_int32_t,
kh_int64_t,
kh_pymap_t,
kh_str_t,
kh_uint8_t,
kh_uint16_t,
kh_uint32_t,
kh_uint64_t,
khcomplex64_t,
khcomplex128_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
# prototypes for sharing
cdef class HashTable:
pass
cdef class UInt64HashTable(HashTable):
cdef kh_uint64_t *table
cpdef get_item(self, uint64_t val)
cpdef set_item(self, uint64_t key, Py_ssize_t val)
cdef class Int64HashTable(HashTable):
cdef kh_int64_t *table
cpdef get_item(self, int64_t val)
cpdef set_item(self, int64_t key, Py_ssize_t val)
cdef class UInt32HashTable(HashTable):
cdef kh_uint32_t *table
cpdef get_item(self, uint32_t val)
cpdef set_item(self, uint32_t key, Py_ssize_t val)
cdef class Int32HashTable(HashTable):
cdef kh_int32_t *table
cpdef get_item(self, int32_t val)
cpdef set_item(self, int32_t key, Py_ssize_t val)
cdef class UInt16HashTable(HashTable):
cdef kh_uint16_t *table
cpdef get_item(self, uint16_t val)
cpdef set_item(self, uint16_t key, Py_ssize_t val)
cdef class Int16HashTable(HashTable):
cdef kh_int16_t *table
cpdef get_item(self, int16_t val)
cpdef set_item(self, int16_t key, Py_ssize_t val)
cdef class UInt8HashTable(HashTable):
cdef kh_uint8_t *table
cpdef get_item(self, uint8_t val)
cpdef set_item(self, uint8_t key, Py_ssize_t val)
cdef class Int8HashTable(HashTable):
cdef kh_int8_t *table
cpdef get_item(self, int8_t val)
cpdef set_item(self, int8_t key, Py_ssize_t val)
cdef class Float64HashTable(HashTable):
cdef kh_float64_t *table
cpdef get_item(self, float64_t val)
cpdef set_item(self, float64_t key, Py_ssize_t val)
cdef class Float32HashTable(HashTable):
cdef kh_float32_t *table
cpdef get_item(self, float32_t val)
cpdef set_item(self, float32_t key, Py_ssize_t val)
cdef class Complex64HashTable(HashTable):
cdef kh_complex64_t *table
cpdef get_item(self, complex64_t val)
cpdef set_item(self, complex64_t key, Py_ssize_t val)
cdef class Complex128HashTable(HashTable):
cdef kh_complex128_t *table
cpdef get_item(self, complex128_t val)
cpdef set_item(self, complex128_t key, Py_ssize_t val)
cdef class PyObjectHashTable(HashTable):
cdef kh_pymap_t *table
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
cdef class StringHashTable(HashTable):
cdef kh_str_t *table
cpdef get_item(self, str val)
cpdef set_item(self, str key, Py_ssize_t val)
cdef struct Int64VectorData:
int64_t *data
Py_ssize_t n, m
cdef class Vector:
cdef bint external_view_exists
cdef class Int64Vector(Vector):
cdef Int64VectorData *data
cdef ndarray ao
cdef resize(self)
cpdef ndarray to_array(self)
cdef inline void append(self, int64_t x)
cdef extend(self, int64_t[:] x)
from typing import (
Any,
Hashable,
Literal,
)
import numpy as np
def unique_label_indices(
labels: np.ndarray, # const int64_t[:]
) -> np.ndarray: ...
class Factorizer:
count: int
def __init__(self, size_hint: int): ...
def get_count(self) -> int: ...
class ObjectFactorizer(Factorizer):
table: PyObjectHashTable
uniques: ObjectVector
def factorize(
self,
values: np.ndarray, # ndarray[object]
sort: bool = ...,
na_sentinel=...,
na_value=...,
) -> np.ndarray: ... # np.ndarray[intp]
class Int64Factorizer(Factorizer):
table: Int64HashTable
uniques: Int64Vector
def factorize(
self,
values: np.ndarray, # const int64_t[:]
sort: bool = ...,
na_sentinel=...,
na_value=...,
) -> np.ndarray: ... # np.ndarray[intp]
class Int64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int64]
class Int32Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int32]
class Int16Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int16]
class Int8Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int8]
class UInt64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint64]
class UInt32Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint32]
class UInt16Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint16]
class UInt8Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint8]
class Float64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.float64]
class Float32Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.float32]
class Complex128Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex128]
class Complex64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex64]
class StringVector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[object]
class ObjectVector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> np.ndarray: ... # np.ndarray[object]
class HashTable:
# NB: The base HashTable class does _not_ actually have these methods;
# we are putting them here for the sake of mypy, to avoid
# reproducing them in each subclass below.
def __init__(self, size_hint: int = ...): ...
def __len__(self) -> int: ...
def __contains__(self, key: Hashable) -> bool: ...
def sizeof(self, deep: bool = ...) -> int: ...
def get_state(self) -> dict[str, int]: ...
# TODO: `item` type is subclass-specific
def get_item(self, item): ... # TODO: return type?
def set_item(self, item) -> None: ...
# FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
def map(
self,
keys: np.ndarray, # np.ndarray[subclass-specific]
values: np.ndarray, # const int64_t[:]
) -> None: ...
def map_locations(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
) -> None: ...
def lookup(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
) -> np.ndarray: ... # np.ndarray[np.intp]
def get_labels(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
uniques, # SubclassTypeVector
count_prior: int = ...,
na_sentinel: int = ...,
na_value: object = ...,
) -> np.ndarray: ... # np.ndarray[intp_t]
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
return_inverse: bool = ...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
np.ndarray, # np.ndarray[np.intp],
] | np.ndarray: ... # np.ndarray[subclass-specific]
def _unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
uniques, # FooVector
count_prior: int = ...,
na_sentinel: int = ...,
na_value: object = ...,
ignore_na: bool = ...,
return_inverse: bool = ...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
np.ndarray, # np.ndarray[np.intp],
] | np.ndarray: ... # np.ndarray[subclass-specific]
def factorize(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
na_sentinel: int = ...,
na_value: object = ...,
mask=...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
np.ndarray, # np.ndarray[np.intp],
]: ...
class Complex128HashTable(HashTable): ...
class Complex64HashTable(HashTable): ...
class Float64HashTable(HashTable): ...
class Float32HashTable(HashTable): ...
class Int64HashTable(HashTable):
# Only Int64HashTable has get_labels_groupby
def get_labels_groupby(
self,
values: np.ndarray, # const int64_t[:]
) -> tuple[
np.ndarray, # np.ndarray[np.intp]
np.ndarray, # np.ndarray[np.int64]
]: ...
class Int32HashTable(HashTable): ...
class Int16HashTable(HashTable): ...
class Int8HashTable(HashTable): ...
class UInt64HashTable(HashTable): ...
class UInt32HashTable(HashTable): ...
class UInt16HashTable(HashTable): ...
class UInt8HashTable(HashTable): ...
class StringHashTable(HashTable): ...
class PyObjectHashTable(HashTable): ...
def duplicated_int64(
values: np.ndarray, # const int64_t[:] values
keep: Literal["last", "first", False] = ...,
) -> np.ndarray: ... # np.ndarray[bool]
# TODO: Is it actually bool or is it uint8?
def mode_int64(
values: np.ndarray, # const int64_t[:] values
dropna: bool,
) -> np.ndarray: ... # np.ndarray[np.int64]
def value_count_int64(
values: np.ndarray, # const int64_t[:]
dropna: bool,
) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # np.ndarray[np.int64]
def duplicated(
values: np.ndarray,
keep: Literal["last", "first", False] = ...,
) -> np.ndarray: ... # np.ndarray[bool]
def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
def value_count(
values: np.ndarray,
dropna: bool,
) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64]
# arr and values should have same dtype
def ismember(
arr: np.ndarray,
values: np.ndarray,
) -> np.ndarray: ... # np.ndarray[bool]
def object_hash(obj) -> int: ...
def objects_are_equal(a, b) -> bool: ...
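Illustrative usage of a concrete table, with behavior inferred from the stubs above (`lookup` returns -1 for keys that are absent):

>>> import numpy as np
>>> from pandas._libs.hashtable import Int64HashTable
>>> table = Int64HashTable()
>>> table.set_item(5, 0)
>>> table.get_item(5)
0
>>> table.lookup(np.array([5, 7], dtype=np.int64))
array([ 0, -1])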
cimport cython
from cpython.mem cimport (
PyMem_Free,
PyMem_Malloc,
)
from cpython.ref cimport (
Py_INCREF,
PyObject,
)
from libc.stdlib cimport (
free,
malloc,
)
import numpy as np
cimport numpy as cnp
from numpy cimport (
float64_t,
ndarray,
uint8_t,
uint32_t,
)
from numpy.math cimport NAN
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.khash cimport (
KHASH_TRACE_DOMAIN,
are_equivalent_float32_t,
are_equivalent_float64_t,
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_needed_n_buckets,
kh_python_hash_equal,
kh_python_hash_func,
kh_str_t,
khcomplex64_t,
khcomplex128_t,
khiter_t,
)
from pandas._libs.missing cimport checknull
def get_hashtable_trace_domain():
return KHASH_TRACE_DOMAIN
def object_hash(obj):
return kh_python_hash_func(obj)
def objects_are_equal(a, b):
return kh_python_hash_equal(a, b)
cdef int64_t NPY_NAT = util.get_nat()
SIZE_HINT_LIMIT = (1 << 20) + 7
cdef Py_ssize_t _INIT_VEC_CAP = 128
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"
cdef class Factorizer:
cdef readonly:
Py_ssize_t count
def __cinit__(self, size_hint: int):
self.count = 0
def get_count(self) -> int:
return self.count
cdef class ObjectFactorizer(Factorizer):
cdef public:
PyObjectHashTable table
ObjectVector uniques
def __cinit__(self, size_hint: int):
self.table = PyObjectHashTable(size_hint)
self.uniques = ObjectVector()
def factorize(
self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
) -> np.ndarray:
"""
Returns
-------
np.ndarray[np.intp]
Examples
--------
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
cdef:
ndarray[intp_t] labels
if self.uniques.external_view_exists:
uniques = ObjectVector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, na_value)
mask = (labels == na_sentinel)
# sort on
if sort:
sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
labels = reverse_indexer.take(labels, mode='clip')
labels[mask] = na_sentinel
self.count = len(self.uniques)
return labels
cdef class Int64Factorizer(Factorizer):
cdef public:
Int64HashTable table
Int64Vector uniques
def __cinit__(self, size_hint: int):
self.table = Int64HashTable(size_hint)
self.uniques = Int64Vector()
def factorize(self, const int64_t[:] values, sort=False,
na_sentinel=-1, na_value=None) -> np.ndarray:
"""
Returns
-------
ndarray[intp_t]
Examples
--------
Factorize int64 values.
>>> factorize(np.array([1, 2, 3], dtype=np.int64))
array([0, 1, 2])
"""
cdef:
ndarray[intp_t] labels
if self.uniques.external_view_exists:
uniques = Int64Vector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
na_value=na_value)
# sort on
if sort:
sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
labels = reverse_indexer.take(labels)
self.count = len(self.uniques)
return labels
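An illustrative round-trip showing the stateful design: the table and uniques persist across `factorize` calls, so previously seen values keep their codes:

>>> import numpy as np
>>> from pandas._libs.hashtable import Int64Factorizer
>>> f = Int64Factorizer(3)
>>> f.factorize(np.array([5, 5, 7], dtype=np.int64))
array([0, 0, 1])
>>> f.factorize(np.array([7, 9], dtype=np.int64))  # 7 keeps its code
array([1, 2])
>>> f.get_count()
3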
@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(const int64_t[:] labels) -> ndarray:
"""
Indices of the first occurrences of the unique labels
*excluding* -1; equivalent to:
np.unique(labels, return_index=True)[1]
"""
cdef:
int ret = 0
Py_ssize_t i, n = len(labels)
kh_int64_t *table = kh_init_int64()
Int64Vector idx = Int64Vector()
ndarray[int64_t, ndim=1] arr
Int64VectorData *ud = idx.data
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
with nogil:
for i in range(n):
kh_put_int64(table, labels[i], &ret)
if ret != 0:
if needs_resize(ud):
with gil:
idx.resize()
append_data_int64(ud, i)
kh_destroy_int64(table)
arr = idx.to_array()
arr = arr[np.asarray(labels)[arr].argsort()]
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
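A worked example of the documented equivalence (labels of -1 are dropped, and the result is ordered by label value):

>>> import numpy as np
>>> from pandas._libs.hashtable import unique_label_indices
>>> labels = np.array([3, -1, 3, 0, 0], dtype=np.int64)
>>> unique_label_indices(labels)
array([3, 0])
>>> idx = np.unique(labels, return_index=True)[1]
>>> idx[labels[idx] != -1]  # same result via the numpy one-liner
array([3, 0])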
import numpy as np
class IndexEngine:
over_size_threshold: bool
def __init__(self, vgetter, n: int): ...
def __contains__(self, val: object) -> bool: ...
# -> int | slice | np.ndarray[bool]
def get_loc(self, val: object) -> int | slice | np.ndarray: ...
def sizeof(self, deep: bool = False) -> int: ...
def __sizeof__(self) -> int: ...
@property
def is_unique(self) -> bool: ...
@property
def is_monotonic_increasing(self) -> bool: ...
@property
def is_monotonic_decreasing(self) -> bool: ...
def get_backfill_indexer(
self, other: np.ndarray, limit: int | None = ...
) -> np.ndarray: ...
def get_pad_indexer(
self, other: np.ndarray, limit: int | None = ...
) -> np.ndarray: ...
@property
def is_mapping_populated(self) -> bool: ...
def clear_mapping(self): ...
def get_indexer(self, values: np.ndarray) -> np.ndarray: ... # np.ndarray[np.intp]
def get_indexer_non_unique(
self,
targets: np.ndarray,
) -> tuple[
np.ndarray, # np.ndarray[np.intp]
np.ndarray, # np.ndarray[np.intp]
]: ...
class Float64Engine(IndexEngine): ...
class Float32Engine(IndexEngine): ...
class Int64Engine(IndexEngine): ...
class Int32Engine(IndexEngine): ...
class Int16Engine(IndexEngine): ...
class Int8Engine(IndexEngine): ...
class UInt64Engine(IndexEngine): ...
class UInt32Engine(IndexEngine): ...
class UInt16Engine(IndexEngine): ...
class UInt8Engine(IndexEngine): ...
class ObjectEngine(IndexEngine): ...
class DatetimeEngine(Int64Engine): ...
class TimedeltaEngine(DatetimeEngine): ...
class PeriodEngine(Int64Engine): ...
class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
def __init__(
self,
levels: list[np.ndarray], # all entries hashable
labels: list[np.ndarray], # all entries integer-dtyped
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
): ...
def get_indexer(
self,
target: np.ndarray, # np.ndarray[object]
) -> np.ndarray: ... # np.ndarray[np.intp]
def _extract_level_codes(self, target: object): ...
def get_indexer_with_fill(
self,
target: np.ndarray, # np.ndarray[object] of tuples
values: np.ndarray, # np.ndarray[object] of tuples
method: str,
limit: int | None,
) -> np.ndarray: ... # np.ndarray[np.int64]
cdef class NDFrameIndexerBase:
"""
A base class for _NDFrameIndexer for fast instantiation and attribute access.
"""
cdef public:
str name
object obj, _ndim
def __init__(self, name: str, obj):
self.obj = obj
self.name = name
self._ndim = None
@property
def ndim(self) -> int:
# Delay `ndim` instantiation until required as reading it
# from `obj` isn't entirely cheap.
ndim = self._ndim
if ndim is None:
ndim = self._ndim = self.obj.ndim
if ndim > 2:
raise ValueError(
"NDFrameIndexer does not support NDFrame objects with ndim > 2"
)
return ndim
cdef bint c_is_list_like(object, bint) except -1