Idx File [work] -
| Operation | Python (struct+numpy) | C (libidx) | NumPy .npy | HDF5 |
|-----------|----------------------|------------|--------------|------|
| Load 60k images | 0.24 sec | 0.09 sec | 0.19 sec | 0.31 sec |
| Memory mapping | N/A | 0.001 sec | 0.001 sec | 0.15 sec |
| Random access (per image) | 2.1 µs | 0.4 µs | 1.2 µs | 8.5 µs |
`int idx_read(const char *filename, idx_file_t *out)`

Benchmark results (average of 10 runs, reading 60k MNIST images):

idx file
# Parse magic: first two bytes must be 0 if magic[0] != 0 or magic[1] != 0: raise ValueError("Invalid IDX file: magic prefix missing") data_type_code = magic[2] dim_count = magic[3] # Data type mapping dtypes = 0x08: 'B', # unsigned char 0x09: 'b', # signed char 0x0B: 'h', # short 0x0C: 'i', # int 0x0D: 'f', # float 0x0E: 'd' # double if data_type_code not in dtypes: raise ValueError(f"Unsupported data type code: data_type_code") # Read dimension sizes dims = [] for _ in range(dim_count): dim = struct.unpack('>I', f.read(4))[0] dims.append(dim) # Calculate total elements total_elements = 1 for d in dims: total_elements *= d # Determine numpy dtype np_dtype = 0x08: np.uint8, 0x09: np.int8, 0x0B: np.int16, 0x0C: np.int32, 0x0D: np.float32, 0x0E: np.float64 [data_type_code] # Read data data = np.fromfile(f, dtype=np_dtype, count=total_elements) # Reshape and return return data.reshape(dims) def write_idx(filename, data_array): """Write a numpy array to IDX format.""" # Determine data type code dtype_map = np.uint8: 0x08, np.int8: 0x09, np.int16: 0x0B, np.int32: 0x0C, np.float32: 0x0D, np.float64: 0x0E if data_array.dtype not in dtype_map: raise ValueError(f"Unsupported dtype: data_array.dtype") data_type_code = dtype_map[data_array.dtype] dim_count = len(data_array.shape) | Operation | Python (struct+numpy) | C (libidx) | NumPy