From 3bc14749a6a8fa43abe92a0631cd808ccec1a2ed Mon Sep 17 00:00:00 2001
From: Christopher Ariza <ariza@flexatone.com>
Date: Fri, 3 Jul 2026 10:21:11 -0700
Subject: [PATCH 1/5] first rendering

---
 src/__init__.py        |   1 +
 src/__init__.pyi       |   3 +
 src/_arraykit.c        |   4 +
 src/auto_map.c         | 324 +++++++++++++++++++++++++++++++++++++++++
 src/auto_map.h         |   2 +
 test/test_factorize.py | 274 ++++++++++++++++++++++++++++++++++
 6 files changed, 608 insertions(+)
 create mode 100644 test/test_factorize.py

diff --git a/src/__init__.py b/src/__init__.py
index 0f2505f4..65289266 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -25,6 +25,7 @@
 from ._arraykit import split_after_count as split_after_count
 from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen
 from ._arraykit import write_array_to_file as write_array_to_file
+from ._arraykit import factorize as factorize
 from ._arraykit import count_iteration as count_iteration
 from ._arraykit import first_true_1d as first_true_1d
 from ._arraykit import first_true_2d as first_true_2d
diff --git a/src/__init__.pyi b/src/__init__.pyi
index c20e75a9..8065bcf2 100644
--- a/src/__init__.pyi
+++ b/src/__init__.pyi
@@ -224,6 +224,9 @@ def write_array_to_file(
     fortran_order: bool = False,
     buffersize: int = 8192,
 ) -> None: ...
+def factorize(
+    array: np.ndarray, *, sort: bool = ...
+) -> tp.Tuple[np.ndarray, np.ndarray]: ...
 def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ...
 def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ...
 def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ...
diff --git a/src/_arraykit.c b/src/_arraykit.c
index 297d1fa8..d04315e6 100644
--- a/src/_arraykit.c
+++ b/src/_arraykit.c
@@ -70,6 +70,10 @@ static PyMethodDef arraykit_methods[] =  {
             (PyCFunction)write_array_to_file,
             METH_VARARGS | METH_KEYWORDS,
             NULL},
+    {"factorize",
+            (PyCFunction)factorize,
+            METH_VARARGS | METH_KEYWORDS,
+            NULL},
     {NULL},
 };
 
diff --git a/src/auto_map.c b/src/auto_map.c
index cc05795a..e2d9a96c 100644
--- a/src/auto_map.c
+++ b/src/auto_map.c
@@ -2520,6 +2520,330 @@ fam_init(PyObject *self, PyObject *args, PyObject *kwargs)
 # undef INSERT_FLEXIBLE
 
 
+//------------------------------------------------------------------------------
+// factorize
+
+// Detect a Python-level float NaN (a Python float or a NumPy floating scalar) so
+// that object-dtype arrays collapse all NaN into a single code, matching the
+// float-dtype behavior. Only real floats are treated as NaN here; None, NaT, and
+// complex NaN remain ordinary distinct keys. Returns 1 for float NaN, else 0.
+static inline int
+factorize_obj_is_float_nan(PyObject* key)
+{
+    if (PyFloat_Check(key)) {
+        return isnan(PyFloat_AS_DOUBLE(key));
+    }
+    if (PyArray_IsScalar(key, Half)) {
+        return npy_half_isnan(PyArrayScalar_VAL(key, Half));
+    }
+    if (PyArray_IsScalar(key, Float32)) {
+        return isnan(PyArrayScalar_VAL(key, Float32));
+    }
+    if (PyArray_IsScalar(key, Float64)) {
+        return isnan(PyArrayScalar_VAL(key, Float64));
+    }
+    return 0;
+}
+
+// Given a probe result `pos_expr` and its `hash_val` for the element at index
+// `i`, either assign a new sequential code (empty slot) or reuse the code of the
+// already-seen key (occupied slot). Stores the first-occurrence input index in
+// the table so the reused lookup_hash_* probes compare against the right value.
+// Requires `scratch`, `codes`, `code_of_index`, `first_index`, `k`, `i` in scope.
+# define FACTORIZE_RECORD(pos_expr, hash_val)                          \
+{                                                                      \
+    Py_ssize_t _pos = (pos_expr);                                      \
+    if (_pos < 0) {                                                    \
+        goto fail;                                                     \
+    }                                                                  \
+    if (scratch.table[_pos].hash == -1) {                             \
+        scratch.table[_pos].keys_pos = i;                             \
+        scratch.table[_pos].hash = (hash_val);                        \
+        first_index[k] = i;                                           \
+        code_of_index[i] = k;                                         \
+        codes[i] = k;                                                 \
+        k++;                                                          \
+    }                                                                 \
+    else {                                                            \
+        codes[i] = code_of_index[scratch.table[_pos].keys_pos];      \
+    }                                                                 \
+}                                                                     \
+
+# define FACTORIZE_INT(npy_type, kat_lookup)                          \
+{                                                                      \
+    for (npy_intp i = 0; i < n; i++) {                                \
+        npy_int64 v = (npy_int64)*(npy_type*)PyArray_GETPTR1(a, i);   \
+        Py_hash_t hash = int_to_hash(v);                             \
+        FACTORIZE_RECORD(lookup_hash_int(&scratch, v, hash, kat_lookup), hash); \
+    }                                                                 \
+}                                                                     \
+
+# define FACTORIZE_UINT(npy_type, kat_lookup)                         \
+{                                                                      \
+    for (npy_intp i = 0; i < n; i++) {                                \
+        npy_uint64 v = (npy_uint64)*(npy_type*)PyArray_GETPTR1(a, i); \
+        Py_hash_t hash = uint_to_hash(v);                            \
+        FACTORIZE_RECORD(lookup_hash_uint(&scratch, v, hash, kat_lookup), hash); \
+    }                                                                 \
+}                                                                     \
+
+// Floats need explicit NaN handling: all NaN collapse to one code and never
+// enter the table (the map compares with `==`, so NaN would otherwise never
+// match itself). +0.0/-0.0 and inf are handled correctly by the normal path.
+# define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref)            \
+{                                                                      \
+    for (npy_intp i = 0; i < n; i++) {                                \
+        npy_double v = post_deref(*(npy_type*)PyArray_GETPTR1(a, i)); \
+        if (v != v) {                                                 \
+            if (nan_code < 0) {                                       \
+                nan_code = k;                                         \
+                first_index[k] = i;                                  \
+                codes[i] = k;                                        \
+                k++;                                                 \
+            }                                                        \
+            else {                                                   \
+                codes[i] = nan_code;                                 \
+            }                                                        \
+            continue;                                                \
+        }                                                            \
+        Py_hash_t hash = double_to_hash(v);                         \
+        FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \
+    }                                                                 \
+}                                                                     \
+
+# define FACTORIZE_FLEXIBLE(char_type, lookup_func, hash_func, get_end_func, dt_size_expr) \
+{                                                                      \
+    Py_ssize_t dt_size = (dt_size_expr);                              \
+    for (npy_intp i = 0; i < n; i++) {                                \
+        char_type* v = (char_type*)PyArray_GETPTR1(a, i);            \
+        Py_ssize_t ksize = get_end_func(v, dt_size) - v;             \
+        Py_hash_t hash = hash_func(v, ksize);                        \
+        FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \
+    }                                                                 \
+}                                                                     \
+
+// Hash-based factorize: return (uniques, codes) such that
+// array[i] == uniques[codes[i]], in O(n), reusing the AutoMap hash table.
+PyObject *
+factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
+{
+    static char *kwlist[] = {"array", "sort", NULL};
+    PyObject *array_obj = NULL;
+    int sort = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
+            "O|$p:factorize", kwlist,
+            &array_obj,
+            &sort)) {
+        return NULL;
+    }
+    if (!PyArray_Check(array_obj)) {
+        PyErr_Format(PyExc_TypeError,
+                "Expected a NumPy array, not %s.", Py_TYPE(array_obj)->tp_name);
+        return NULL;
+    }
+    PyArrayObject *a = (PyArrayObject *)array_obj;
+    if (PyArray_NDIM(a) != 1) {
+        PyErr_SetString(PyExc_TypeError, "Array must be 1-dimensional");
+        return NULL;
+    }
+
+    npy_intp n = PyArray_SIZE(a);
+    int array_t = PyArray_TYPE(a);
+    KeysArrayType kat = at_to_kat(array_t, a);
+
+    PyObject *codes_arr = NULL;
+    PyObject *idx_arr = NULL;
+    PyObject *uniques = NULL;
+    PyObject *list = NULL;             // object/KAT_LIST path only
+    npy_intp *code_of_index = NULL;    // input index -> code
+    npy_intp *first_index = NULL;      // code -> first-occurrence input index
+    npy_intp *rank = NULL;             // sort remap
+    Py_ssize_t k = 0;                  // running distinct count
+    Py_ssize_t nan_code = -1;          // shared code for all NaN, -1 until seen
+
+    // A private, stack-allocated scratch table; never exposed to Python. Only
+    // the fields read by grow_table / lookup_hash_* are set.
+    FAMObject scratch;
+    scratch.table = NULL;
+    scratch.table_size = 0;
+    scratch.keys = NULL;
+    scratch.keys_array_type = kat;
+    scratch.keys_size = n;
+    scratch.key_buffer = NULL;         // never needed by the array lookups
+
+    codes_arr = PyArray_EMPTY(1, &n, NPY_INTP, 0);
+    if (!codes_arr) {
+        goto fail;
+    }
+    npy_intp *codes = (npy_intp*)PyArray_DATA((PyArrayObject*)codes_arr);
+
+    if (n > 0) {
+        code_of_index = PyMem_New(npy_intp, n);
+        first_index = PyMem_New(npy_intp, n);
+        if (!code_of_index || !first_index) {
+            PyErr_NoMemory();
+            goto fail;
+        }
+    }
+
+    // The lookups dereference scratch.keys at stored input indices. For a usable
+    // array KAT that is the (borrowed) input array; otherwise build a list.
+    if (kat) {
+        scratch.keys = array_obj;
+    }
+    else {
+        if (array_t == NPY_DATETIME || array_t == NPY_TIMEDELTA) {
+            list = PySequence_List(array_obj);
+        }
+        else {
+            list = PyArray_ToList(a);
+        }
+        if (!list) {
+            goto fail;
+        }
+        scratch.keys = list;
+    }
+
+    if (grow_table(&scratch, n)) {
+        goto fail;
+    }
+
+    switch (kat) {
+        case KAT_INT8:   FACTORIZE_INT(npy_int8,   KAT_INT8);   break;
+        case KAT_INT16:  FACTORIZE_INT(npy_int16,  KAT_INT16);  break;
+        case KAT_INT32:  FACTORIZE_INT(npy_int32,  KAT_INT32);  break;
+        case KAT_INT64:  FACTORIZE_INT(npy_int64,  KAT_INT64);  break;
+        case KAT_UINT8:  FACTORIZE_UINT(npy_uint8,  KAT_UINT8);  break;
+        case KAT_UINT16: FACTORIZE_UINT(npy_uint16, KAT_UINT16); break;
+        case KAT_UINT32: FACTORIZE_UINT(npy_uint32, KAT_UINT32); break;
+        case KAT_UINT64: FACTORIZE_UINT(npy_uint64, KAT_UINT64); break;
+        case KAT_FLOAT16: FACTORIZE_FLOAT(npy_half, KAT_FLOAT16, npy_half_to_double); break;
+        case KAT_FLOAT32: FACTORIZE_FLOAT(npy_float, KAT_FLOAT32, ); break;
+        case KAT_FLOAT64: FACTORIZE_FLOAT(npy_double, KAT_FLOAT64, ); break;
+        case KAT_UNICODE:
+            FACTORIZE_FLEXIBLE(Py_UCS4, lookup_hash_unicode, unicode_to_hash,
+                    ucs4_get_end_p, PyArray_ITEMSIZE(a) / UCS4_SIZE);
+            break;
+        case KAT_STRING:
+            FACTORIZE_FLEXIBLE(char, lookup_hash_string, string_to_hash,
+                    char_get_end_p, PyArray_ITEMSIZE(a));
+            break;
+        case KAT_DTY:
+        case KAT_DTM:
+        case KAT_DTW:
+        case KAT_DTD:
+        case KAT_DTh:
+        case KAT_DTm:
+        case KAT_DTs:
+        case KAT_DTms:
+        case KAT_DTus:
+        case KAT_DTns:
+        case KAT_DTps:
+        case KAT_DTfs:
+        case KAT_DTas:
+            // datetime64/timedelta64 store an int64; NaT (INT64_MIN) compares
+            // equal to itself here, so all NaT collapse into one code naturally.
+            FACTORIZE_INT(npy_int64, KAT_INT64);
+            break;
+        default: { // KAT_LIST: object dtype, complex, dt64 without a unit
+            for (npy_intp i = 0; i < n; i++) {
+                PyObject* key = PyList_GET_ITEM(list, i); // borrowed
+                if (factorize_obj_is_float_nan(key)) {
+                    if (nan_code < 0) {
+                        nan_code = k;
+                        first_index[k] = i;
+                        codes[i] = k;
+                        k++;
+                    }
+                    else {
+                        codes[i] = nan_code;
+                    }
+                    continue;
+                }
+                Py_hash_t hash = PyObject_Hash(key);
+                if (hash == -1) {
+                    goto fail;
+                }
+                FACTORIZE_RECORD(lookup_hash_obj(&scratch, key, hash), hash);
+            }
+            break;
+        }
+    }
+
+    // uniques: take the first-occurrence values out of the input, same dtype.
+    idx_arr = PyArray_EMPTY(1, &k, NPY_INTP, 0);
+    if (!idx_arr) {
+        goto fail;
+    }
+    if (k > 0) {
+        memcpy(PyArray_DATA((PyArrayObject*)idx_arr), first_index, k * sizeof(npy_intp));
+    }
+    uniques = PyArray_TakeFrom(a, idx_arr, 0, NULL, NPY_RAISE);
+    if (!uniques) {
+        goto fail;
+    }
+
+    if (sort && k > 1) {
+        PyObject* order = PyArray_ArgSort((PyArrayObject*)uniques, 0, NPY_QUICKSORT);
+        if (!order) {
+            goto fail;
+        }
+        npy_intp* order_data = (npy_intp*)PyArray_DATA((PyArrayObject*)order);
+        rank = PyMem_New(npy_intp, k);
+        if (!rank) {
+            Py_DECREF(order);
+            PyErr_NoMemory();
+            goto fail;
+        }
+        for (npy_intp j = 0; j < k; j++) {
+            rank[order_data[j]] = j;
+        }
+        for (npy_intp i = 0; i < n; i++) {
+            codes[i] = rank[codes[i]];
+        }
+        PyObject* uniques_sorted = PyArray_TakeFrom(
+                (PyArrayObject*)uniques, order, 0, NULL, NPY_RAISE);
+        Py_DECREF(order);
+        if (!uniques_sorted) {
+            goto fail;
+        }
+        Py_SETREF(uniques, uniques_sorted);
+    }
+
+    PyArray_CLEARFLAGS((PyArrayObject*)codes_arr, NPY_ARRAY_WRITEABLE);
+    PyArray_CLEARFLAGS((PyArrayObject*)uniques, NPY_ARRAY_WRITEABLE);
+
+    PyMem_Free(scratch.table);
+    PyMem_Free(code_of_index);
+    PyMem_Free(first_index);
+    PyMem_Free(rank);
+    Py_DECREF(idx_arr);
+    Py_XDECREF(list);
+
+    PyObject* result = PyTuple_Pack(2, uniques, codes_arr);
+    Py_DECREF(uniques);
+    Py_DECREF(codes_arr);
+    return result;
+
+fail:
+    PyMem_Free(scratch.table);
+    PyMem_Free(code_of_index);
+    PyMem_Free(first_index);
+    PyMem_Free(rank);
+    Py_XDECREF(idx_arr);
+    Py_XDECREF(list);
+    Py_XDECREF(codes_arr);
+    Py_XDECREF(uniques);
+    return NULL;
+}
+
+# undef FACTORIZE_RECORD
+# undef FACTORIZE_INT
+# undef FACTORIZE_UINT
+# undef FACTORIZE_FLOAT
+# undef FACTORIZE_FLEXIBLE
+
+
 static PyObject *
 fam_repr(FAMObject *self)
 {
diff --git a/src/auto_map.h b/src/auto_map.h
index fe9a03b5..87feb5b1 100644
--- a/src/auto_map.h
+++ b/src/auto_map.h
@@ -10,5 +10,7 @@ extern PyTypeObject FAMVType;
 extern PyTypeObject FAMType;
 extern PyObject *NonUniqueError;
 
+PyObject *factorize(PyObject *m, PyObject *args, PyObject *kwargs);
+
 
 # endif /* ARRAYKIT_SRC_AUTO_MAP_H_ */
diff --git a/test/test_factorize.py b/test/test_factorize.py
new file mode 100644
index 00000000..630d09a5
--- /dev/null
+++ b/test/test_factorize.py
@@ -0,0 +1,274 @@
+import unittest
+
+import numpy as np
+
+from arraykit import factorize
+
+
+def _eq_elem(x, y):
+    # NaN/NaT-aware scalar equality (nan != nan under ==)
+    try:
+        if x != x and y != y:
+            return True
+    except (TypeError, ValueError):
+        pass
+    try:
+        return bool(x == y)
+    except (TypeError, ValueError):
+        return x is y
+
+
+def roundtrips(uniques, codes, array):
+    reconstructed = uniques[codes]
+    if len(array) != len(codes):
+        return False
+    return all(_eq_elem(reconstructed[i], array[i]) for i in range(len(array)))
+
+
+class TestUnit(unittest.TestCase):
+    # ------------------------------------------------------------------
+    # basic behavior
+
+    def test_factorize_int_a(self) -> None:
+        a = np.array([10, 10, 10, 20, 20, 30])
+        uniques, codes = factorize(a)
+        self.assertEqual(uniques.tolist(), [10, 20, 30])  # first-appearance order
+        self.assertEqual(codes.tolist(), [0, 0, 0, 1, 1, 2])
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_first_appearance_order(self) -> None:
+        a = np.array([30, 10, 10, 20, 30])
+        uniques, codes = factorize(a)
+        self.assertEqual(uniques.tolist(), [30, 10, 20])
+        self.assertEqual(codes.tolist(), [0, 1, 1, 2, 0])
+
+    def test_factorize_codes_dtype_intp(self) -> None:
+        _, codes = factorize(np.array([1, 2, 3]))
+        self.assertEqual(codes.dtype, np.dtype(np.intp))
+
+    def test_factorize_codes_in_range(self) -> None:
+        a = np.arange(50) % 7
+        uniques, codes = factorize(a)
+        self.assertTrue((codes >= 0).all())
+        self.assertTrue((codes < len(uniques)).all())
+
+    def test_factorize_outputs_immutable(self) -> None:
+        uniques, codes = factorize(np.array([1, 1, 2]))
+        self.assertFalse(uniques.flags.writeable)
+        self.assertFalse(codes.flags.writeable)
+
+    # ------------------------------------------------------------------
+    # dtype coverage (each hash path)
+
+    def test_factorize_int_widths(self) -> None:
+        for dt in (np.int8, np.int16, np.int32, np.int64):
+            a = np.array([1, 1, 2, -3, -3, 2], dtype=dt)
+            uniques, codes = factorize(a)
+            self.assertTrue(roundtrips(uniques, codes, a), dt)
+            self.assertEqual(uniques.dtype, np.dtype(dt))
+
+    def test_factorize_uint_widths(self) -> None:
+        for dt in (np.uint8, np.uint16, np.uint32, np.uint64):
+            a = np.array([1, 1, 2, 200, 200], dtype=dt)
+            uniques, codes = factorize(a)
+            self.assertTrue(roundtrips(uniques, codes, a), dt)
+
+    def test_factorize_float64(self) -> None:
+        a = np.array([1.5, 1.5, 2.5, 0.0, -0.0, np.inf, -np.inf, np.inf])
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+        # +0.0 and -0.0 are equal -> one group
+        self.assertEqual(codes[3], codes[4])
+        # +inf and -inf are distinct
+        self.assertNotEqual(codes[5], codes[6])
+        self.assertEqual(codes[5], codes[7])
+
+    def test_factorize_float32(self) -> None:
+        a = np.array([1, 1, 2, np.nan, np.nan], dtype=np.float32)
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_float16(self) -> None:
+        a = np.array([1, 1, 2, 2, 3, np.nan, np.nan], dtype=np.float16)
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+        self.assertEqual(codes[5], codes[6])  # NaN collapse
+
+    def test_factorize_unicode(self) -> None:
+        a = np.array(['a', 'a', 'bb', 'b', 'b', 'a'])
+        uniques, codes = factorize(a)
+        self.assertEqual(uniques.tolist(), ['a', 'bb', 'b'])
+        self.assertEqual(codes.tolist(), [0, 0, 1, 2, 2, 0])
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_bytes(self) -> None:
+        a = np.array([b'a', b'a', b'bb', b'b'], dtype='S2')
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_datetime64(self) -> None:
+        a = np.array(
+            ['2020-01-01', '2020-01-01', '2021-01-01', '2020-01-01'],
+            dtype='datetime64[D]',
+        )
+        uniques, codes = factorize(a)
+        self.assertEqual(codes.tolist(), [0, 0, 1, 0])
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_timedelta64(self) -> None:
+        a = np.array([5, 5, 10, 10, 5], dtype='timedelta64[D]')
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    # ------------------------------------------------------------------
+    # NaN / NaT semantics
+
+    def test_factorize_float_nan_one_group(self) -> None:
+        a = np.array([1.0, np.nan, np.nan, 2.0, np.nan])
+        uniques, codes = factorize(a)
+        nan_codes = {codes[1], codes[2], codes[4]}
+        self.assertEqual(len(nan_codes), 1)  # all NaN share one code
+        self.assertEqual(np.isnan(uniques).sum(), 1)  # exactly one NaN unique
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_datetime_nat_one_group(self) -> None:
+        nat = np.datetime64('NaT')
+        a = np.array([nat, nat, np.datetime64('2020'), nat], dtype='datetime64[Y]')
+        _, codes = factorize(a)
+        self.assertEqual(codes[0], codes[1])
+        self.assertEqual(codes[0], codes[3])
+        self.assertNotEqual(codes[0], codes[2])
+
+    def test_factorize_object_nan_matches_float(self) -> None:
+        a = np.array(
+            ['a', 'a', float('nan'), float('nan'), None, None, 1, 1],
+            dtype=object,
+        )
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+        self.assertEqual(codes[2], codes[3])       # float('nan') collapse
+        self.assertEqual(codes[4], codes[5])       # None is its own group
+        self.assertNotEqual(codes[2], codes[4])    # nan != None
+
+    def test_factorize_object_mixed_types(self) -> None:
+        a = np.array([1, '1', 1.0, 2], dtype=object)
+        uniques, codes = factorize(a)
+        self.assertTrue(roundtrips(uniques, codes, a))
+        self.assertNotEqual(codes[0], codes[1])    # 1 (int) != '1' (str)
+        self.assertEqual(codes[0], codes[2])       # 1 == 1.0 by value
+
+    # ------------------------------------------------------------------
+    # sort=True
+
+    def test_factorize_sort_int(self) -> None:
+        a = np.array([30, 10, 10, 20, 30])
+        uniques, codes = factorize(a, sort=True)
+        self.assertEqual(uniques.tolist(), [10, 20, 30])
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_sort_reconstruction_matches_unsorted(self) -> None:
+        a = np.array([3.0, np.nan, 1.0, np.nan, 2.0, 1.0])
+        u0, c0 = factorize(a, sort=False)
+        u1, c1 = factorize(a, sort=True)
+        # both reconstruct the same original array
+        self.assertTrue(roundtrips(u0, c0, a))
+        self.assertTrue(roundtrips(u1, c1, a))
+        # sorted uniques are ascending (NaN sorts last)
+        finite = u1[~np.isnan(u1)]
+        self.assertEqual(finite.tolist(), sorted(finite.tolist()))
+
+    def test_factorize_sort_strings(self) -> None:
+        a = np.array(['c', 'a', 'b', 'a', 'c'])
+        uniques, codes = factorize(a, sort=True)
+        self.assertEqual(uniques.tolist(), ['a', 'b', 'c'])
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    def test_factorize_sort_is_keyword_only(self) -> None:
+        a = np.array([1, 2, 3])
+        with self.assertRaises(TypeError):
+            factorize(a, True)  # type: ignore
+
+    def test_factorize_sort_unorderable_object_raises(self) -> None:
+        a = np.array([1, 'a', 2], dtype=object)
+        with self.assertRaises(TypeError):
+            factorize(a, sort=True)
+
+    # ------------------------------------------------------------------
+    # edge cases
+
+    def test_factorize_empty(self) -> None:
+        for a in (np.array([], dtype=np.int64), np.array([], dtype=float),
+                  np.array([], dtype='U4'), np.array([], dtype=object)):
+            uniques, codes = factorize(a)
+            self.assertEqual(len(uniques), 0)
+            self.assertEqual(len(codes), 0)
+            self.assertEqual(codes.dtype, np.dtype(np.intp))
+            self.assertEqual(uniques.dtype, a.dtype)
+
+    def test_factorize_single(self) -> None:
+        uniques, codes = factorize(np.array([7]))
+        self.assertEqual(uniques.tolist(), [7])
+        self.assertEqual(codes.tolist(), [0])
+
+    def test_factorize_all_unique(self) -> None:
+        a = np.array([1, 2, 3, 4])
+        uniques, codes = factorize(a)
+        self.assertEqual(codes.tolist(), [0, 1, 2, 3])
+        self.assertEqual(uniques.tolist(), [1, 2, 3, 4])
+
+    def test_factorize_all_same(self) -> None:
+        a = np.array([5, 5, 5, 5])
+        uniques, codes = factorize(a)
+        self.assertEqual(uniques.tolist(), [5])
+        self.assertEqual(codes.tolist(), [0, 0, 0, 0])
+
+    def test_factorize_non_contiguous(self) -> None:
+        # underlying [1,1,2,2,3,3]; [::2] view is [1,2,3]
+        a = np.array([1, 1, 2, 2, 3, 3], dtype=np.int64)[::2]
+        self.assertFalse(a.flags.c_contiguous)
+        uniques, codes = factorize(a)
+        self.assertEqual(uniques.tolist(), [1, 2, 3])
+        self.assertEqual(codes.tolist(), [0, 1, 2])
+
+    def test_factorize_mutable_input_accepted(self) -> None:
+        a = np.array([1, 1, 2])
+        self.assertTrue(a.flags.writeable)
+        uniques, codes = factorize(a)  # must not require immutable input
+        self.assertTrue(roundtrips(uniques, codes, a))
+
+    # ------------------------------------------------------------------
+    # input validation
+
+    def test_factorize_requires_array(self) -> None:
+        with self.assertRaises(TypeError):
+            factorize([1, 2, 3])  # type: ignore
+
+    def test_factorize_requires_1d(self) -> None:
+        with self.assertRaises(TypeError):
+            factorize(np.arange(4).reshape(2, 2))
+
+    # ------------------------------------------------------------------
+    # parity with numpy sort-based unique/inverse
+
+    def test_factorize_parity_np_unique(self) -> None:
+        rng = np.random.RandomState(0)
+        for _ in range(50):
+            a = rng.randint(0, 6, size=rng.randint(0, 40)).astype(np.int64)
+            uniques, codes = factorize(a, sort=True)
+            u_np, inv_np = np.unique(a, return_inverse=True)
+            self.assertEqual(uniques.tolist(), u_np.tolist())
+            self.assertEqual(codes.tolist(), inv_np.ravel().tolist())
+
+    def test_factorize_parity_np_unique_strings(self) -> None:
+        rng = np.random.RandomState(1)
+        pool = np.array(['aa', 'bb', 'cc', 'dd', 'ee'])
+        for _ in range(30):
+            a = pool[rng.randint(0, 5, size=rng.randint(1, 40))]
+            uniques, codes = factorize(a, sort=True)
+            u_np, inv_np = np.unique(a, return_inverse=True)
+            self.assertEqual(uniques.tolist(), u_np.tolist())
+            self.assertEqual(codes.tolist(), inv_np.ravel().tolist())
+
+
+if __name__ == '__main__':
+    unittest.main()

From 96ca7a5ece204f5a41ca132ecbd0ea6707dacd3f Mon Sep 17 00:00:00 2001
From: Christopher Ariza <ariza@flexatone.com>
Date: Fri, 3 Jul 2026 18:30:21 -0700
Subject: [PATCH 2/5] improved macros for contiguity

---
 src/auto_map.c | 140 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 91 insertions(+), 49 deletions(-)

diff --git a/src/auto_map.c b/src/auto_map.c
index e2d9a96c..3ff46566 100644
--- a/src/auto_map.c
+++ b/src/auto_map.c
@@ -2569,58 +2569,95 @@ factorize_obj_is_float_nan(PyObject* key)
     }                                                                 \
 }                                                                     \
 
-# define FACTORIZE_INT(npy_type, kat_lookup)                          \
-{                                                                      \
-    for (npy_intp i = 0; i < n; i++) {                                \
-        npy_int64 v = (npy_int64)*(npy_type*)PyArray_GETPTR1(a, i);   \
-        Py_hash_t hash = int_to_hash(v);                             \
-        FACTORIZE_RECORD(lookup_hash_int(&scratch, v, hash, kat_lookup), hash); \
-    }                                                                 \
-}                                                                     \
-
-# define FACTORIZE_UINT(npy_type, kat_lookup)                         \
-{                                                                      \
-    for (npy_intp i = 0; i < n; i++) {                                \
-        npy_uint64 v = (npy_uint64)*(npy_type*)PyArray_GETPTR1(a, i); \
-        Py_hash_t hash = uint_to_hash(v);                            \
-        FACTORIZE_RECORD(lookup_hash_uint(&scratch, v, hash, kat_lookup), hash); \
-    }                                                                 \
-}                                                                     \
-
-// Floats need explicit NaN handling: all NaN collapse to one code and never
-// enter the table (the map compares with `==`, so NaN would otherwise never
-// match itself). +0.0/-0.0 and inf are handled correctly by the normal path.
-# define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref)            \
-{                                                                      \
-    for (npy_intp i = 0; i < n; i++) {                                \
-        npy_double v = post_deref(*(npy_type*)PyArray_GETPTR1(a, i)); \
-        if (v != v) {                                                 \
-            if (nan_code < 0) {                                       \
-                nan_code = k;                                         \
-                first_index[k] = i;                                  \
-                codes[i] = k;                                        \
-                k++;                                                 \
-            }                                                        \
-            else {                                                   \
-                codes[i] = nan_code;                                 \
-            }                                                        \
-            continue;                                                \
+// Integer/unsigned scalar factorize loop. When the input is C-contiguous, index
+// a typed pointer (`b[i]`, compile-time itemsize) instead of PyArray_GETPTR1's
+// runtime stride multiply -- materially faster, matching INSERT_SCALARS.
+# define FACTORIZE_SCALAR_LOOP(npy_type, value_t, hash_func, lookup_func, kat_lookup) \
+{                                                                     \
+    if (contiguous) {                                                \
+        const npy_type* b = (const npy_type*)PyArray_DATA(a);        \
+        for (npy_intp i = 0; i < n; i++) {                           \
+            value_t v = (value_t)b[i];                               \
+            Py_hash_t hash = hash_func(v);                           \
+            FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \
+        }                                                            \
+    }                                                                \
+    else {                                                           \
+        for (npy_intp i = 0; i < n; i++) {                           \
+            value_t v = (value_t)*(const npy_type*)PyArray_GETPTR1(a, i); \
+            Py_hash_t hash = hash_func(v);                           \
+            FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \
         }                                                            \
-        Py_hash_t hash = double_to_hash(v);                         \
+    }                                                                \
+}                                                                    \
+
+# define FACTORIZE_INT(npy_type, kat_lookup) \
+    FACTORIZE_SCALAR_LOOP(npy_type, npy_int64, int_to_hash, lookup_hash_int, kat_lookup)
+
+# define FACTORIZE_UINT(npy_type, kat_lookup) \
+    FACTORIZE_SCALAR_LOOP(npy_type, npy_uint64, uint_to_hash, lookup_hash_uint, kat_lookup)
+
+// Per-element float body. All NaN collapse to one code and never enter the table
+// (the map compares with `==`, so NaN would otherwise never match itself);
+// +0.0/-0.0 and inf are handled correctly by the normal path.
+# define FACTORIZE_FLOAT_ELEM(value_expr, kat_lookup)                \
+{                                                                     \
+    npy_double v = (value_expr);                                     \
+    if (v != v) {                                                    \
+        if (nan_code < 0) {                                          \
+            nan_code = k;                                            \
+            first_index[k] = i;                                     \
+            codes[i] = k;                                           \
+            k++;                                                    \
+        }                                                           \
+        else {                                                      \
+            codes[i] = nan_code;                                    \
+        }                                                           \
+    }                                                               \
+    else {                                                          \
+        Py_hash_t hash = double_to_hash(v);                        \
         FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \
-    }                                                                 \
-}                                                                     \
-
+    }                                                               \
+}                                                                   \
+
+# define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref)           \
+{                                                                     \
+    if (contiguous) {                                               \
+        const npy_type* b = (const npy_type*)PyArray_DATA(a);       \
+        for (npy_intp i = 0; i < n; i++) {                          \
+            FACTORIZE_FLOAT_ELEM(post_deref(b[i]), kat_lookup)      \
+        }                                                           \
+    }                                                               \
+    else {                                                         \
+        for (npy_intp i = 0; i < n; i++) {                          \
+            FACTORIZE_FLOAT_ELEM(post_deref(*(const npy_type*)PyArray_GETPTR1(a, i)), kat_lookup) \
+        }                                                           \
+    }                                                               \
+}                                                                   \
+
+// Flexible (unicode/string) loop. In the contiguous case step a running pointer
+// by dt_size (incremental add, no per-element multiply), matching INSERT_FLEXIBLE.
 # define FACTORIZE_FLEXIBLE(char_type, lookup_func, hash_func, get_end_func, dt_size_expr) \
-{                                                                      \
-    Py_ssize_t dt_size = (dt_size_expr);                              \
-    for (npy_intp i = 0; i < n; i++) {                                \
-        char_type* v = (char_type*)PyArray_GETPTR1(a, i);            \
-        Py_ssize_t ksize = get_end_func(v, dt_size) - v;             \
-        Py_hash_t hash = hash_func(v, ksize);                        \
-        FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \
-    }                                                                 \
-}                                                                     \
+{                                                                     \
+    Py_ssize_t dt_size = (dt_size_expr);                            \
+    if (contiguous) {                                               \
+        char_type* v = (char_type*)PyArray_DATA(a);                 \
+        for (npy_intp i = 0; i < n; i++) {                          \
+            Py_ssize_t ksize = get_end_func(v, dt_size) - v;        \
+            Py_hash_t hash = hash_func(v, ksize);                   \
+            FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \
+            v += dt_size;                                           \
+        }                                                           \
+    }                                                               \
+    else {                                                         \
+        for (npy_intp i = 0; i < n; i++) {                          \
+            char_type* v = (char_type*)PyArray_GETPTR1(a, i);       \
+            Py_ssize_t ksize = get_end_func(v, dt_size) - v;        \
+            Py_hash_t hash = hash_func(v, ksize);                   \
+            FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \
+        }                                                           \
+    }                                                               \
+}                                                                   \
 
 // Hash-based factorize: return (uniques, codes) such that
 // array[i] == uniques[codes[i]], in O(n), reusing the AutoMap hash table.
@@ -2708,6 +2745,9 @@ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
         goto fail;
     }
 
+    // Enables the typed-pointer fast path in the scalar/flexible loops below.
+    int contiguous = PyArray_IS_C_CONTIGUOUS(a);
+
     switch (kat) {
         case KAT_INT8:   FACTORIZE_INT(npy_int8,   KAT_INT8);   break;
         case KAT_INT16:  FACTORIZE_INT(npy_int16,  KAT_INT16);  break;
@@ -2838,8 +2878,10 @@ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
 }
 
 # undef FACTORIZE_RECORD
+# undef FACTORIZE_SCALAR_LOOP
 # undef FACTORIZE_INT
 # undef FACTORIZE_UINT
+# undef FACTORIZE_FLOAT_ELEM
 # undef FACTORIZE_FLOAT
 # undef FACTORIZE_FLEXIBLE
 

From e89336784d12d330ffb0f08b8ddc463745459085 Mon Sep 17 00:00:00 2001
From: Christopher Ariza <ariza@flexatone.com>
Date: Fri, 3 Jul 2026 18:35:54 -0700
Subject: [PATCH 3/5] updated pointer usage

---
 src/auto_map.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/auto_map.c b/src/auto_map.c
index 3ff46566..ee1791b8 100644
--- a/src/auto_map.c
+++ b/src/auto_map.c
@@ -2576,10 +2576,14 @@ factorize_obj_is_float_nan(PyObject* key)
 {                                                                     \
     if (contiguous) {                                                \
         const npy_type* b = (const npy_type*)PyArray_DATA(a);        \
-        for (npy_intp i = 0; i < n; i++) {                           \
-            value_t v = (value_t)b[i];                               \
+        const npy_type* b_end = b + n;                               \
+        npy_intp i = 0;                                              \
+        while (b < b_end) {                                          \
+            value_t v = (value_t)*b;                                 \
             Py_hash_t hash = hash_func(v);                           \
             FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \
+            b++;                                                     \
+            i++;                                                     \
         }                                                            \
     }                                                                \
     else {                                                           \
@@ -2624,8 +2628,12 @@ factorize_obj_is_float_nan(PyObject* key)
 {                                                                     \
     if (contiguous) {                                               \
         const npy_type* b = (const npy_type*)PyArray_DATA(a);       \
-        for (npy_intp i = 0; i < n; i++) {                          \
-            FACTORIZE_FLOAT_ELEM(post_deref(b[i]), kat_lookup)      \
+        const npy_type* b_end = b + n;                              \
+        npy_intp i = 0;                                             \
+        while (b < b_end) {                                         \
+            FACTORIZE_FLOAT_ELEM(post_deref(*b), kat_lookup)        \
+            b++;                                                    \
+            i++;                                                    \
         }                                                           \
     }                                                               \
     else {                                                         \

From 5c7361530b757cba7b7f9aaf5ba966216a2a862b Mon Sep 17 00:00:00 2001
From: Christopher Ariza <ariza@flexatone.com>
Date: Fri, 3 Jul 2026 20:33:27 -0700
Subject: [PATCH 4/5] enhancements

---
 src/auto_map.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/auto_map.c b/src/auto_map.c
index ee1791b8..64d1299d 100644
--- a/src/auto_map.c
+++ b/src/auto_map.c
@@ -2523,6 +2523,31 @@ fam_init(PyObject *self, PyObject *args, PyObject *kwargs)
 //------------------------------------------------------------------------------
 // factorize
 
+// A fast float hash for factorize only. Unlike double_to_hash (CPython-
+// compatible, so hash(1.0)==hash(1), via frexp + a loop), factorize never needs
+// cross-type hashing -- it only compares floats to floats from the same array by
+// `==`. So we canonicalize -0.0 to +0.0 (they compare equal, must hash equal),
+// reinterpret the bits, and apply a splitmix64 finalizer for good avalanche.
+// NaN never reaches here (handled before probing); +/-inf hash distinctly, which
+// is correct since they are distinct values. lookup_hash_double takes the hash as
+// a parameter, so this stays self-consistent within factorize's scratch table.
+static inline Py_hash_t
+factorize_double_to_hash(double v)
+{
+    if (v == 0.0) {
+        v = 0.0; // collapse -0.0 to +0.0
+    }
+    npy_uint64 x;
+    memcpy(&x, &v, sizeof(x));
+    x ^= x >> 33;
+    x *= 0xff51afd7ed558ccdULL;
+    x ^= x >> 33;
+    x *= 0xc4ceb9fe1a85ec53ULL;
+    x ^= x >> 33;
+    Py_hash_t h = (Py_hash_t)x;
+    return h == -1 ? -2 : h; // -1 marks an empty slot
+}
+
 // Detect a Python-level float NaN (a Python float or a NumPy floating scalar) so
 // that object-dtype arrays collapse all NaN into a single code, matching the
 // float-dtype behavior. Only real floats are treated as NaN here; None, NaT, and
@@ -2619,7 +2644,7 @@ factorize_obj_is_float_nan(PyObject* key)
         }                                                           \
     }                                                               \
     else {                                                          \
-        Py_hash_t hash = double_to_hash(v);                        \
+        Py_hash_t hash = factorize_double_to_hash(v);              \
         FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \
     }                                                               \
 }                                                                   \

From aef055636cf5aa8e1cb1970bc9e0088c5435366c Mon Sep 17 00:00:00 2001
From: Christopher Ariza <ariza@flexatone.com>
Date: Fri, 3 Jul 2026 21:31:00 -0700
Subject: [PATCH 5/5] format

---
 src/auto_map.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/auto_map.c b/src/auto_map.c
index 64d1299d..4dbe6b3a 100644
--- a/src/auto_map.c
+++ b/src/auto_map.c
@@ -2918,6 +2918,7 @@ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
 # undef FACTORIZE_FLOAT
 # undef FACTORIZE_FLEXIBLE
 
+//------------------------------------------------------------------------------
 
 static PyObject *
 fam_repr(FAMObject *self)