From 3bc14749a6a8fa43abe92a0631cd808ccec1a2ed Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 3 Jul 2026 10:21:11 -0700 Subject: [PATCH 1/5] first rendering --- src/__init__.py | 1 + src/__init__.pyi | 3 + src/_arraykit.c | 4 + src/auto_map.c | 324 +++++++++++++++++++++++++++++++++++++++++ src/auto_map.h | 2 + test/test_factorize.py | 274 ++++++++++++++++++++++++++++++++++ 6 files changed, 608 insertions(+) create mode 100644 test/test_factorize.py diff --git a/src/__init__.py b/src/__init__.py index 0f2505f4..65289266 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -25,6 +25,7 @@ from ._arraykit import split_after_count as split_after_count from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen from ._arraykit import write_array_to_file as write_array_to_file +from ._arraykit import factorize as factorize from ._arraykit import count_iteration as count_iteration from ._arraykit import first_true_1d as first_true_1d from ._arraykit import first_true_2d as first_true_2d diff --git a/src/__init__.pyi b/src/__init__.pyi index c20e75a9..8065bcf2 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -224,6 +224,9 @@ def write_array_to_file( fortran_order: bool = False, buffersize: int = 8192, ) -> None: ... +def factorize( + array: np.ndarray, *, sort: bool = ... +) -> tp.Tuple[np.ndarray, np.ndarray]: ... def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ... def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ... def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ... diff --git a/src/_arraykit.c b/src/_arraykit.c index 297d1fa8..d04315e6 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -70,6 +70,10 @@ static PyMethodDef arraykit_methods[] = { (PyCFunction)write_array_to_file, METH_VARARGS | METH_KEYWORDS, NULL}, + {"factorize", + (PyCFunction)factorize, + METH_VARARGS | METH_KEYWORDS, + NULL}, {NULL}, }; diff --git a/src/auto_map.c b/src/auto_map.c index cc05795a..e2d9a96c 100644 --- a/src/auto_map.c +++ b/src/auto_map.c @@ -2520,6 +2520,330 @@ fam_init(PyObject *self, PyObject *args, PyObject *kwargs) # undef INSERT_FLEXIBLE +//------------------------------------------------------------------------------ +// factorize + +// Detect a Python-level float NaN (a Python float or a NumPy floating scalar) so +// that object-dtype arrays collapse all NaN into a single code, matching the +// float-dtype behavior. Only real floats are treated as NaN here; None, NaT, and +// complex NaN remain ordinary distinct keys. Returns 1 for float NaN, else 0. +static inline int +factorize_obj_is_float_nan(PyObject* key) +{ + if (PyFloat_Check(key)) { + return isnan(PyFloat_AS_DOUBLE(key)); + } + if (PyArray_IsScalar(key, Half)) { + return npy_half_isnan(PyArrayScalar_VAL(key, Half)); + } + if (PyArray_IsScalar(key, Float32)) { + return isnan(PyArrayScalar_VAL(key, Float32)); + } + if (PyArray_IsScalar(key, Float64)) { + return isnan(PyArrayScalar_VAL(key, Float64)); + } + return 0; +} + +// Given a probe result `pos_expr` and its `hash_val` for the element at index +// `i`, either assign a new sequential code (empty slot) or reuse the code of the +// already-seen key (occupied slot). Stores the first-occurrence input index in +// the table so the reused lookup_hash_* probes compare against the right value. +// Requires `scratch`, `codes`, `code_of_index`, `first_index`, `k`, `i` in scope. +# define FACTORIZE_RECORD(pos_expr, hash_val) \ +{ \ + Py_ssize_t _pos = (pos_expr); \ + if (_pos < 0) { \ + goto fail; \ + } \ + if (scratch.table[_pos].hash == -1) { \ + scratch.table[_pos].keys_pos = i; \ + scratch.table[_pos].hash = (hash_val); \ + first_index[k] = i; \ + code_of_index[i] = k; \ + codes[i] = k; \ + k++; \ + } \ + else { \ + codes[i] = code_of_index[scratch.table[_pos].keys_pos]; \ + } \ +} \ + +# define FACTORIZE_INT(npy_type, kat_lookup) \ +{ \ + for (npy_intp i = 0; i < n; i++) { \ + npy_int64 v = (npy_int64)*(npy_type*)PyArray_GETPTR1(a, i); \ + Py_hash_t hash = int_to_hash(v); \ + FACTORIZE_RECORD(lookup_hash_int(&scratch, v, hash, kat_lookup), hash); \ + } \ +} \ + +# define FACTORIZE_UINT(npy_type, kat_lookup) \ +{ \ + for (npy_intp i = 0; i < n; i++) { \ + npy_uint64 v = (npy_uint64)*(npy_type*)PyArray_GETPTR1(a, i); \ + Py_hash_t hash = uint_to_hash(v); \ + FACTORIZE_RECORD(lookup_hash_uint(&scratch, v, hash, kat_lookup), hash); \ + } \ +} \ + +// Floats need explicit NaN handling: all NaN collapse to one code and never +// enter the table (the map compares with `==`, so NaN would otherwise never +// match itself). +0.0/-0.0 and inf are handled correctly by the normal path. +# define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref) \ +{ \ + for (npy_intp i = 0; i < n; i++) { \ + npy_double v = post_deref(*(npy_type*)PyArray_GETPTR1(a, i)); \ + if (v != v) { \ + if (nan_code < 0) { \ + nan_code = k; \ + first_index[k] = i; \ + codes[i] = k; \ + k++; \ + } \ + else { \ + codes[i] = nan_code; \ + } \ + continue; \ + } \ + Py_hash_t hash = double_to_hash(v); \ + FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \ + } \ +} \ + +# define FACTORIZE_FLEXIBLE(char_type, lookup_func, hash_func, get_end_func, dt_size_expr) \ +{ \ + Py_ssize_t dt_size = (dt_size_expr); \ + for (npy_intp i = 0; i < n; i++) { \ + char_type* v = (char_type*)PyArray_GETPTR1(a, i); \ + Py_ssize_t ksize = get_end_func(v, dt_size) - v; \ + Py_hash_t hash = hash_func(v, ksize); \ + FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \ + } \ +} \ + +// Hash-based factorize: return (uniques, codes) such that +// array[i] == uniques[codes[i]], in O(n), reusing the AutoMap hash table. +PyObject * +factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = {"array", "sort", NULL}; + PyObject *array_obj = NULL; + int sort = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "O|$p:factorize", kwlist, + &array_obj, + &sort)) { + return NULL; + } + if (!PyArray_Check(array_obj)) { + PyErr_Format(PyExc_TypeError, + "Expected a NumPy array, not %s.", Py_TYPE(array_obj)->tp_name); + return NULL; + } + PyArrayObject *a = (PyArrayObject *)array_obj; + if (PyArray_NDIM(a) != 1) { + PyErr_SetString(PyExc_TypeError, "Array must be 1-dimensional"); + return NULL; + } + + npy_intp n = PyArray_SIZE(a); + int array_t = PyArray_TYPE(a); + KeysArrayType kat = at_to_kat(array_t, a); + + PyObject *codes_arr = NULL; + PyObject *idx_arr = NULL; + PyObject *uniques = NULL; + PyObject *list = NULL; // object/KAT_LIST path only + npy_intp *code_of_index = NULL; // input index -> code + npy_intp *first_index = NULL; // code -> first-occurrence input index + npy_intp *rank = NULL; // sort remap + Py_ssize_t k = 0; // running distinct count + Py_ssize_t nan_code = -1; // shared code for all NaN, -1 until seen + + // A private, stack-allocated scratch table; never exposed to Python. Only + // the fields read by grow_table / lookup_hash_* are set. + FAMObject scratch; + scratch.table = NULL; + scratch.table_size = 0; + scratch.keys = NULL; + scratch.keys_array_type = kat; + scratch.keys_size = n; + scratch.key_buffer = NULL; // never needed by the array lookups + + codes_arr = PyArray_EMPTY(1, &n, NPY_INTP, 0); + if (!codes_arr) { + goto fail; + } + npy_intp *codes = (npy_intp*)PyArray_DATA((PyArrayObject*)codes_arr); + + if (n > 0) { + code_of_index = PyMem_New(npy_intp, n); + first_index = PyMem_New(npy_intp, n); + if (!code_of_index || !first_index) { + PyErr_NoMemory(); + goto fail; + } + } + + // The lookups dereference scratch.keys at stored input indices. For a usable + // array KAT that is the (borrowed) input array; otherwise build a list. + if (kat) { + scratch.keys = array_obj; + } + else { + if (array_t == NPY_DATETIME || array_t == NPY_TIMEDELTA) { + list = PySequence_List(array_obj); + } + else { + list = PyArray_ToList(a); + } + if (!list) { + goto fail; + } + scratch.keys = list; + } + + if (grow_table(&scratch, n)) { + goto fail; + } + + switch (kat) { + case KAT_INT8: FACTORIZE_INT(npy_int8, KAT_INT8); break; + case KAT_INT16: FACTORIZE_INT(npy_int16, KAT_INT16); break; + case KAT_INT32: FACTORIZE_INT(npy_int32, KAT_INT32); break; + case KAT_INT64: FACTORIZE_INT(npy_int64, KAT_INT64); break; + case KAT_UINT8: FACTORIZE_UINT(npy_uint8, KAT_UINT8); break; + case KAT_UINT16: FACTORIZE_UINT(npy_uint16, KAT_UINT16); break; + case KAT_UINT32: FACTORIZE_UINT(npy_uint32, KAT_UINT32); break; + case KAT_UINT64: FACTORIZE_UINT(npy_uint64, KAT_UINT64); break; + case KAT_FLOAT16: FACTORIZE_FLOAT(npy_half, KAT_FLOAT16, npy_half_to_double); break; + case KAT_FLOAT32: FACTORIZE_FLOAT(npy_float, KAT_FLOAT32, ); break; + case KAT_FLOAT64: FACTORIZE_FLOAT(npy_double, KAT_FLOAT64, ); break; + case KAT_UNICODE: + FACTORIZE_FLEXIBLE(Py_UCS4, lookup_hash_unicode, unicode_to_hash, + ucs4_get_end_p, PyArray_ITEMSIZE(a) / UCS4_SIZE); + break; + case KAT_STRING: + FACTORIZE_FLEXIBLE(char, lookup_hash_string, string_to_hash, + char_get_end_p, PyArray_ITEMSIZE(a)); + break; + case KAT_DTY: + case KAT_DTM: + case KAT_DTW: + case KAT_DTD: + case KAT_DTh: + case KAT_DTm: + case KAT_DTs: + case KAT_DTms: + case KAT_DTus: + case KAT_DTns: + case KAT_DTps: + case KAT_DTfs: + case KAT_DTas: + // datetime64/timedelta64 store an int64; NaT (INT64_MIN) compares + // equal to itself here, so all NaT collapse into one code naturally. + FACTORIZE_INT(npy_int64, KAT_INT64); + break; + default: { // KAT_LIST: object dtype, complex, dt64 without a unit + for (npy_intp i = 0; i < n; i++) { + PyObject* key = PyList_GET_ITEM(list, i); // borrowed + if (factorize_obj_is_float_nan(key)) { + if (nan_code < 0) { + nan_code = k; + first_index[k] = i; + codes[i] = k; + k++; + } + else { + codes[i] = nan_code; + } + continue; + } + Py_hash_t hash = PyObject_Hash(key); + if (hash == -1) { + goto fail; + } + FACTORIZE_RECORD(lookup_hash_obj(&scratch, key, hash), hash); + } + break; + } + } + + // uniques: take the first-occurrence values out of the input, same dtype. + idx_arr = PyArray_EMPTY(1, &k, NPY_INTP, 0); + if (!idx_arr) { + goto fail; + } + if (k > 0) { + memcpy(PyArray_DATA((PyArrayObject*)idx_arr), first_index, k * sizeof(npy_intp)); + } + uniques = PyArray_TakeFrom(a, idx_arr, 0, NULL, NPY_RAISE); + if (!uniques) { + goto fail; + } + + if (sort && k > 1) { + PyObject* order = PyArray_ArgSort((PyArrayObject*)uniques, 0, NPY_QUICKSORT); + if (!order) { + goto fail; + } + npy_intp* order_data = (npy_intp*)PyArray_DATA((PyArrayObject*)order); + rank = PyMem_New(npy_intp, k); + if (!rank) { + Py_DECREF(order); + PyErr_NoMemory(); + goto fail; + } + for (npy_intp j = 0; j < k; j++) { + rank[order_data[j]] = j; + } + for (npy_intp i = 0; i < n; i++) { + codes[i] = rank[codes[i]]; + } + PyObject* uniques_sorted = PyArray_TakeFrom( + (PyArrayObject*)uniques, order, 0, NULL, NPY_RAISE); + Py_DECREF(order); + if (!uniques_sorted) { + goto fail; + } + Py_SETREF(uniques, uniques_sorted); + } + + PyArray_CLEARFLAGS((PyArrayObject*)codes_arr, NPY_ARRAY_WRITEABLE); + PyArray_CLEARFLAGS((PyArrayObject*)uniques, NPY_ARRAY_WRITEABLE); + + PyMem_Free(scratch.table); + PyMem_Free(code_of_index); + PyMem_Free(first_index); + PyMem_Free(rank); + Py_DECREF(idx_arr); + Py_XDECREF(list); + + PyObject* result = PyTuple_Pack(2, uniques, codes_arr); + Py_DECREF(uniques); + Py_DECREF(codes_arr); + return result; + +fail: + PyMem_Free(scratch.table); + PyMem_Free(code_of_index); + PyMem_Free(first_index); + PyMem_Free(rank); + Py_XDECREF(idx_arr); + Py_XDECREF(list); + Py_XDECREF(codes_arr); + Py_XDECREF(uniques); + return NULL; +} + +# undef FACTORIZE_RECORD +# undef FACTORIZE_INT +# undef FACTORIZE_UINT +# undef FACTORIZE_FLOAT +# undef FACTORIZE_FLEXIBLE + + static PyObject * fam_repr(FAMObject *self) { diff --git a/src/auto_map.h b/src/auto_map.h index fe9a03b5..87feb5b1 100644 --- a/src/auto_map.h +++ b/src/auto_map.h @@ -10,5 +10,7 @@ extern PyTypeObject FAMVType; extern PyTypeObject FAMType; extern PyObject *NonUniqueError; +PyObject *factorize(PyObject *m, PyObject *args, PyObject *kwargs); + # endif /* ARRAYKIT_SRC_AUTO_MAP_H_ */ diff --git a/test/test_factorize.py b/test/test_factorize.py new file mode 100644 index 00000000..630d09a5 --- /dev/null +++ b/test/test_factorize.py @@ -0,0 +1,274 @@ +import unittest + +import numpy as np + +from arraykit import factorize + + +def _eq_elem(x, y): + # NaN/NaT-aware scalar equality (nan != nan under ==) + try: + if x != x and y != y: + return True + except (TypeError, ValueError): + pass + try: + return bool(x == y) + except (TypeError, ValueError): + return x is y + + +def roundtrips(uniques, codes, array): + reconstructed = uniques[codes] + if len(array) != len(codes): + return False + return all(_eq_elem(reconstructed[i], array[i]) for i in range(len(array))) + + +class TestUnit(unittest.TestCase): + # ------------------------------------------------------------------ + # basic behavior + + def test_factorize_int_a(self) -> None: + a = np.array([10, 10, 10, 20, 20, 30]) + uniques, codes = factorize(a) + self.assertEqual(uniques.tolist(), [10, 20, 30]) # first-appearance order + self.assertEqual(codes.tolist(), [0, 0, 0, 1, 1, 2]) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_first_appearance_order(self) -> None: + a = np.array([30, 10, 10, 20, 30]) + uniques, codes = factorize(a) + self.assertEqual(uniques.tolist(), [30, 10, 20]) + self.assertEqual(codes.tolist(), [0, 1, 1, 2, 0]) + + def test_factorize_codes_dtype_intp(self) -> None: + _, codes = factorize(np.array([1, 2, 3])) + self.assertEqual(codes.dtype, np.dtype(np.intp)) + + def test_factorize_codes_in_range(self) -> None: + a = np.arange(50) % 7 + uniques, codes = factorize(a) + self.assertTrue((codes >= 0).all()) + self.assertTrue((codes < len(uniques)).all()) + + def test_factorize_outputs_immutable(self) -> None: + uniques, codes = factorize(np.array([1, 1, 2])) + self.assertFalse(uniques.flags.writeable) + self.assertFalse(codes.flags.writeable) + + # ------------------------------------------------------------------ + # dtype coverage (each hash path) + + def test_factorize_int_widths(self) -> None: + for dt in (np.int8, np.int16, np.int32, np.int64): + a = np.array([1, 1, 2, -3, -3, 2], dtype=dt) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a), dt) + self.assertEqual(uniques.dtype, np.dtype(dt)) + + def test_factorize_uint_widths(self) -> None: + for dt in (np.uint8, np.uint16, np.uint32, np.uint64): + a = np.array([1, 1, 2, 200, 200], dtype=dt) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a), dt) + + def test_factorize_float64(self) -> None: + a = np.array([1.5, 1.5, 2.5, 0.0, -0.0, np.inf, -np.inf, np.inf]) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + # +0.0 and -0.0 are equal -> one group + self.assertEqual(codes[3], codes[4]) + # +inf and -inf are distinct + self.assertNotEqual(codes[5], codes[6]) + self.assertEqual(codes[5], codes[7]) + + def test_factorize_float32(self) -> None: + a = np.array([1, 1, 2, np.nan, np.nan], dtype=np.float32) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_float16(self) -> None: + a = np.array([1, 1, 2, 2, 3, np.nan, np.nan], dtype=np.float16) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + self.assertEqual(codes[5], codes[6]) # NaN collapse + + def test_factorize_unicode(self) -> None: + a = np.array(['a', 'a', 'bb', 'b', 'b', 'a']) + uniques, codes = factorize(a) + self.assertEqual(uniques.tolist(), ['a', 'bb', 'b']) + self.assertEqual(codes.tolist(), [0, 0, 1, 2, 2, 0]) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_bytes(self) -> None: + a = np.array([b'a', b'a', b'bb', b'b'], dtype='S2') + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_datetime64(self) -> None: + a = np.array( + ['2020-01-01', '2020-01-01', '2021-01-01', '2020-01-01'], + dtype='datetime64[D]', + ) + uniques, codes = factorize(a) + self.assertEqual(codes.tolist(), [0, 0, 1, 0]) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_timedelta64(self) -> None: + a = np.array([5, 5, 10, 10, 5], dtype='timedelta64[D]') + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + + # ------------------------------------------------------------------ + # NaN / NaT semantics + + def test_factorize_float_nan_one_group(self) -> None: + a = np.array([1.0, np.nan, np.nan, 2.0, np.nan]) + uniques, codes = factorize(a) + nan_codes = {codes[1], codes[2], codes[4]} + self.assertEqual(len(nan_codes), 1) # all NaN share one code + self.assertEqual(np.isnan(uniques).sum(), 1) # exactly one NaN unique + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_datetime_nat_one_group(self) -> None: + nat = np.datetime64('NaT') + a = np.array([nat, nat, np.datetime64('2020'), nat], dtype='datetime64[Y]') + _, codes = factorize(a) + self.assertEqual(codes[0], codes[1]) + self.assertEqual(codes[0], codes[3]) + self.assertNotEqual(codes[0], codes[2]) + + def test_factorize_object_nan_matches_float(self) -> None: + a = np.array( + ['a', 'a', float('nan'), float('nan'), None, None, 1, 1], + dtype=object, + ) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + self.assertEqual(codes[2], codes[3]) # float('nan') collapse + self.assertEqual(codes[4], codes[5]) # None is its own group + self.assertNotEqual(codes[2], codes[4]) # nan != None + + def test_factorize_object_mixed_types(self) -> None: + a = np.array([1, '1', 1.0, 2], dtype=object) + uniques, codes = factorize(a) + self.assertTrue(roundtrips(uniques, codes, a)) + self.assertNotEqual(codes[0], codes[1]) # 1 (int) != '1' (str) + self.assertEqual(codes[0], codes[2]) # 1 == 1.0 by value + + # ------------------------------------------------------------------ + # sort=True + + def test_factorize_sort_int(self) -> None: + a = np.array([30, 10, 10, 20, 30]) + uniques, codes = factorize(a, sort=True) + self.assertEqual(uniques.tolist(), [10, 20, 30]) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_sort_reconstruction_matches_unsorted(self) -> None: + a = np.array([3.0, np.nan, 1.0, np.nan, 2.0, 1.0]) + u0, c0 = factorize(a, sort=False) + u1, c1 = factorize(a, sort=True) + # both reconstruct the same original array + self.assertTrue(roundtrips(u0, c0, a)) + self.assertTrue(roundtrips(u1, c1, a)) + # sorted uniques are ascending (NaN sorts last) + finite = u1[~np.isnan(u1)] + self.assertEqual(finite.tolist(), sorted(finite.tolist())) + + def test_factorize_sort_strings(self) -> None: + a = np.array(['c', 'a', 'b', 'a', 'c']) + uniques, codes = factorize(a, sort=True) + self.assertEqual(uniques.tolist(), ['a', 'b', 'c']) + self.assertTrue(roundtrips(uniques, codes, a)) + + def test_factorize_sort_is_keyword_only(self) -> None: + a = np.array([1, 2, 3]) + with self.assertRaises(TypeError): + factorize(a, True) # type: ignore + + def test_factorize_sort_unorderable_object_raises(self) -> None: + a = np.array([1, 'a', 2], dtype=object) + with self.assertRaises(TypeError): + factorize(a, sort=True) + + # ------------------------------------------------------------------ + # edge cases + + def test_factorize_empty(self) -> None: + for a in (np.array([], dtype=np.int64), np.array([], dtype=float), + np.array([], dtype='U4'), np.array([], dtype=object)): + uniques, codes = factorize(a) + self.assertEqual(len(uniques), 0) + self.assertEqual(len(codes), 0) + self.assertEqual(codes.dtype, np.dtype(np.intp)) + self.assertEqual(uniques.dtype, a.dtype) + + def test_factorize_single(self) -> None: + uniques, codes = factorize(np.array([7])) + self.assertEqual(uniques.tolist(), [7]) + self.assertEqual(codes.tolist(), [0]) + + def test_factorize_all_unique(self) -> None: + a = np.array([1, 2, 3, 4]) + uniques, codes = factorize(a) + self.assertEqual(codes.tolist(), [0, 1, 2, 3]) + self.assertEqual(uniques.tolist(), [1, 2, 3, 4]) + + def test_factorize_all_same(self) -> None: + a = np.array([5, 5, 5, 5]) + uniques, codes = factorize(a) + self.assertEqual(uniques.tolist(), [5]) + self.assertEqual(codes.tolist(), [0, 0, 0, 0]) + + def test_factorize_non_contiguous(self) -> None: + # underlying [1,1,2,2,3,3]; [::2] view is [1,2,3] + a = np.array([1, 1, 2, 2, 3, 3], dtype=np.int64)[::2] + self.assertFalse(a.flags.c_contiguous) + uniques, codes = factorize(a) + self.assertEqual(uniques.tolist(), [1, 2, 3]) + self.assertEqual(codes.tolist(), [0, 1, 2]) + + def test_factorize_mutable_input_accepted(self) -> None: + a = np.array([1, 1, 2]) + self.assertTrue(a.flags.writeable) + uniques, codes = factorize(a) # must not require immutable input + self.assertTrue(roundtrips(uniques, codes, a)) + + # ------------------------------------------------------------------ + # input validation + + def test_factorize_requires_array(self) -> None: + with self.assertRaises(TypeError): + factorize([1, 2, 3]) # type: ignore + + def test_factorize_requires_1d(self) -> None: + with self.assertRaises(TypeError): + factorize(np.arange(4).reshape(2, 2)) + + # ------------------------------------------------------------------ + # parity with numpy sort-based unique/inverse + + def test_factorize_parity_np_unique(self) -> None: + rng = np.random.RandomState(0) + for _ in range(50): + a = rng.randint(0, 6, size=rng.randint(0, 40)).astype(np.int64) + uniques, codes = factorize(a, sort=True) + u_np, inv_np = np.unique(a, return_inverse=True) + self.assertEqual(uniques.tolist(), u_np.tolist()) + self.assertEqual(codes.tolist(), inv_np.ravel().tolist()) + + def test_factorize_parity_np_unique_strings(self) -> None: + rng = np.random.RandomState(1) + pool = np.array(['aa', 'bb', 'cc', 'dd', 'ee']) + for _ in range(30): + a = pool[rng.randint(0, 5, size=rng.randint(1, 40))] + uniques, codes = factorize(a, sort=True) + u_np, inv_np = np.unique(a, return_inverse=True) + self.assertEqual(uniques.tolist(), u_np.tolist()) + self.assertEqual(codes.tolist(), inv_np.ravel().tolist()) + + +if __name__ == '__main__': + unittest.main() From 96ca7a5ece204f5a41ca132ecbd0ea6707dacd3f Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 3 Jul 2026 18:30:21 -0700 Subject: [PATCH 2/5] improved macros for contiguity --- src/auto_map.c | 140 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 49 deletions(-) diff --git a/src/auto_map.c b/src/auto_map.c index e2d9a96c..3ff46566 100644 --- a/src/auto_map.c +++ b/src/auto_map.c @@ -2569,58 +2569,95 @@ factorize_obj_is_float_nan(PyObject* key) } \ } \ -# define FACTORIZE_INT(npy_type, kat_lookup) \ -{ \ - for (npy_intp i = 0; i < n; i++) { \ - npy_int64 v = (npy_int64)*(npy_type*)PyArray_GETPTR1(a, i); \ - Py_hash_t hash = int_to_hash(v); \ - FACTORIZE_RECORD(lookup_hash_int(&scratch, v, hash, kat_lookup), hash); \ - } \ -} \ - -# define FACTORIZE_UINT(npy_type, kat_lookup) \ -{ \ - for (npy_intp i = 0; i < n; i++) { \ - npy_uint64 v = (npy_uint64)*(npy_type*)PyArray_GETPTR1(a, i); \ - Py_hash_t hash = uint_to_hash(v); \ - FACTORIZE_RECORD(lookup_hash_uint(&scratch, v, hash, kat_lookup), hash); \ - } \ -} \ - -// Floats need explicit NaN handling: all NaN collapse to one code and never -// enter the table (the map compares with `==`, so NaN would otherwise never -// match itself). +0.0/-0.0 and inf are handled correctly by the normal path. -# define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref) \ -{ \ - for (npy_intp i = 0; i < n; i++) { \ - npy_double v = post_deref(*(npy_type*)PyArray_GETPTR1(a, i)); \ - if (v != v) { \ - if (nan_code < 0) { \ - nan_code = k; \ - first_index[k] = i; \ - codes[i] = k; \ - k++; \ - } \ - else { \ - codes[i] = nan_code; \ - } \ - continue; \ +// Integer/unsigned scalar factorize loop. When the input is C-contiguous, index +// a typed pointer (`b[i]`, compile-time itemsize) instead of PyArray_GETPTR1's +// runtime stride multiply -- materially faster, matching INSERT_SCALARS. +# define FACTORIZE_SCALAR_LOOP(npy_type, value_t, hash_func, lookup_func, kat_lookup) \ +{ \ + if (contiguous) { \ + const npy_type* b = (const npy_type*)PyArray_DATA(a); \ + for (npy_intp i = 0; i < n; i++) { \ + value_t v = (value_t)b[i]; \ + Py_hash_t hash = hash_func(v); \ + FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \ + } \ + } \ + else { \ + for (npy_intp i = 0; i < n; i++) { \ + value_t v = (value_t)*(const npy_type*)PyArray_GETPTR1(a, i); \ + Py_hash_t hash = hash_func(v); \ + FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \ } \ - Py_hash_t hash = double_to_hash(v); \ + } \ +} \ + +# define FACTORIZE_INT(npy_type, kat_lookup) \ + FACTORIZE_SCALAR_LOOP(npy_type, npy_int64, int_to_hash, lookup_hash_int, kat_lookup) + +# define FACTORIZE_UINT(npy_type, kat_lookup) \ + FACTORIZE_SCALAR_LOOP(npy_type, npy_uint64, uint_to_hash, lookup_hash_uint, kat_lookup) + +// Per-element float body. All NaN collapse to one code and never enter the table +// (the map compares with `==`, so NaN would otherwise never match itself); +// +0.0/-0.0 and inf are handled correctly by the normal path. +# define FACTORIZE_FLOAT_ELEM(value_expr, kat_lookup) \ +{ \ + npy_double v = (value_expr); \ + if (v != v) { \ + if (nan_code < 0) { \ + nan_code = k; \ + first_index[k] = i; \ + codes[i] = k; \ + k++; \ + } \ + else { \ + codes[i] = nan_code; \ + } \ + } \ + else { \ + Py_hash_t hash = double_to_hash(v); \ FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \ - } \ -} \ - + } \ +} \ + +# define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref) \ +{ \ + if (contiguous) { \ + const npy_type* b = (const npy_type*)PyArray_DATA(a); \ + for (npy_intp i = 0; i < n; i++) { \ + FACTORIZE_FLOAT_ELEM(post_deref(b[i]), kat_lookup) \ + } \ + } \ + else { \ + for (npy_intp i = 0; i < n; i++) { \ + FACTORIZE_FLOAT_ELEM(post_deref(*(const npy_type*)PyArray_GETPTR1(a, i)), kat_lookup) \ + } \ + } \ +} \ + +// Flexible (unicode/string) loop. In the contiguous case step a running pointer +// by dt_size (incremental add, no per-element multiply), matching INSERT_FLEXIBLE. # define FACTORIZE_FLEXIBLE(char_type, lookup_func, hash_func, get_end_func, dt_size_expr) \ -{ \ - Py_ssize_t dt_size = (dt_size_expr); \ - for (npy_intp i = 0; i < n; i++) { \ - char_type* v = (char_type*)PyArray_GETPTR1(a, i); \ - Py_ssize_t ksize = get_end_func(v, dt_size) - v; \ - Py_hash_t hash = hash_func(v, ksize); \ - FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \ - } \ -} \ +{ \ + Py_ssize_t dt_size = (dt_size_expr); \ + if (contiguous) { \ + char_type* v = (char_type*)PyArray_DATA(a); \ + for (npy_intp i = 0; i < n; i++) { \ + Py_ssize_t ksize = get_end_func(v, dt_size) - v; \ + Py_hash_t hash = hash_func(v, ksize); \ + FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \ + v += dt_size; \ + } \ + } \ + else { \ + for (npy_intp i = 0; i < n; i++) { \ + char_type* v = (char_type*)PyArray_GETPTR1(a, i); \ + Py_ssize_t ksize = get_end_func(v, dt_size) - v; \ + Py_hash_t hash = hash_func(v, ksize); \ + FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \ + } \ + } \ +} \ // Hash-based factorize: return (uniques, codes) such that // array[i] == uniques[codes[i]], in O(n), reusing the AutoMap hash table. @@ -2708,6 +2745,9 @@ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) goto fail; } + // Enables the typed-pointer fast path in the scalar/flexible loops below. + int contiguous = PyArray_IS_C_CONTIGUOUS(a); + switch (kat) { case KAT_INT8: FACTORIZE_INT(npy_int8, KAT_INT8); break; case KAT_INT16: FACTORIZE_INT(npy_int16, KAT_INT16); break; @@ -2838,8 +2878,10 @@ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) } # undef FACTORIZE_RECORD +# undef FACTORIZE_SCALAR_LOOP # undef FACTORIZE_INT # undef FACTORIZE_UINT +# undef FACTORIZE_FLOAT_ELEM # undef FACTORIZE_FLOAT # undef FACTORIZE_FLEXIBLE From e89336784d12d330ffb0f08b8ddc463745459085 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 3 Jul 2026 18:35:54 -0700 Subject: [PATCH 3/5] updated pointer usage --- src/auto_map.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/auto_map.c b/src/auto_map.c index 3ff46566..ee1791b8 100644 --- a/src/auto_map.c +++ b/src/auto_map.c @@ -2576,10 +2576,14 @@ factorize_obj_is_float_nan(PyObject* key) { \ if (contiguous) { \ const npy_type* b = (const npy_type*)PyArray_DATA(a); \ - for (npy_intp i = 0; i < n; i++) { \ - value_t v = (value_t)b[i]; \ + const npy_type* b_end = b + n; \ + npy_intp i = 0; \ + while (b < b_end) { \ + value_t v = (value_t)*b; \ Py_hash_t hash = hash_func(v); \ FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \ + b++; \ + i++; \ } \ } \ else { \ @@ -2624,8 +2628,12 @@ factorize_obj_is_float_nan(PyObject* key) { \ if (contiguous) { \ const npy_type* b = (const npy_type*)PyArray_DATA(a); \ - for (npy_intp i = 0; i < n; i++) { \ - FACTORIZE_FLOAT_ELEM(post_deref(b[i]), kat_lookup) \ + const npy_type* b_end = b + n; \ + npy_intp i = 0; \ + while (b < b_end) { \ + FACTORIZE_FLOAT_ELEM(post_deref(*b), kat_lookup) \ + b++; \ + i++; \ } \ } \ else { \ From 5c7361530b757cba7b7f9aaf5ba966216a2a862b Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 3 Jul 2026 20:33:27 -0700 Subject: [PATCH 4/5] enhancements --- src/auto_map.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/auto_map.c b/src/auto_map.c index ee1791b8..64d1299d 100644 --- a/src/auto_map.c +++ b/src/auto_map.c @@ -2523,6 +2523,31 @@ fam_init(PyObject *self, PyObject *args, PyObject *kwargs) //------------------------------------------------------------------------------ // factorize +// A fast float hash for factorize only. Unlike double_to_hash (CPython- +// compatible, so hash(1.0)==hash(1), via frexp + a loop), factorize never needs +// cross-type hashing -- it only compares floats to floats from the same array by +// `==`. So we canonicalize -0.0 to +0.0 (they compare equal, must hash equal), +// reinterpret the bits, and apply a splitmix64 finalizer for good avalanche. +// NaN never reaches here (handled before probing); +/-inf hash distinctly, which +// is correct since they are distinct values. lookup_hash_double takes the hash as +// a parameter, so this stays self-consistent within factorize's scratch table. +static inline Py_hash_t +factorize_double_to_hash(double v) +{ + if (v == 0.0) { + v = 0.0; // collapse -0.0 to +0.0 + } + npy_uint64 x; + memcpy(&x, &v, sizeof(x)); + x ^= x >> 33; + x *= 0xff51afd7ed558ccdULL; + x ^= x >> 33; + x *= 0xc4ceb9fe1a85ec53ULL; + x ^= x >> 33; + Py_hash_t h = (Py_hash_t)x; + return h == -1 ? -2 : h; // -1 marks an empty slot +} + // Detect a Python-level float NaN (a Python float or a NumPy floating scalar) so // that object-dtype arrays collapse all NaN into a single code, matching the // float-dtype behavior. Only real floats are treated as NaN here; None, NaT, and @@ -2619,7 +2644,7 @@ factorize_obj_is_float_nan(PyObject* key) } \ } \ else { \ - Py_hash_t hash = double_to_hash(v); \ + Py_hash_t hash = factorize_double_to_hash(v); \ FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \ } \ } \ From aef055636cf5aa8e1cb1970bc9e0088c5435366c Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 3 Jul 2026 21:31:00 -0700 Subject: [PATCH 5/5] format --- src/auto_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/auto_map.c b/src/auto_map.c index 64d1299d..4dbe6b3a 100644 --- a/src/auto_map.c +++ b/src/auto_map.c @@ -2918,6 +2918,7 @@ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) # undef FACTORIZE_FLOAT # undef FACTORIZE_FLEXIBLE +//------------------------------------------------------------------------------ static PyObject * fam_repr(FAMObject *self)