/*
  From: Andre Reinald
  To: submissive

  I though you might be intersted by some modifications I brought to your
  code in order to improve legibility and slightly speed.

  Well, my source is commented and I often wrote 4 lines where you had
  only one, but the generated code is smaller and it gives more
  opportunity to check intermediate steps while debugging.

  By the way, I did so on a PowerMac with CodeWarrior compiler, so your
  source is really portable.
*/

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

// complicated expression better fits as macro (or inline in C++)
#define ByteOf(x) (((x) >> bitsOffset) & 0xff)

typedef unsigned long ulong;

// replaced byte with bitsOffset to avoid *8 operation in loop
static void radix (short bitsOffset, ulong N, ulong *source, ulong *dest)
{
	// suppressed the need for index as it is reported in count
	ulong count[256];
	// added temp variables to simplify writing, understanding and compiler
	// optimization job most of them will be allocated as registers
	ulong *cp, *sp, s, c, i;

	// faster than MemSet
	cp = count;
	for (i = 256; i > 0; --i, ++cp)
		*cp = 0;

	// count occurences of every byte value
	sp = source;
	for (i = N; i > 0; --i, ++sp) {
		cp = count + ByteOf (*sp);
		++(*cp);
	}

	// transform count into index by summing elements and storing into same array
	s = 0;
	cp = count;
	for (i = 256; i > 0; --i, ++cp) {
		c = *cp;
		*cp = s;
		s += c;
	}

	// fill dest with the right values in the right place
	sp = source;
	for (i = N; i > 0; --i, ++sp) {
		s = *sp;
		cp = count + ByteOf (s);
		dest[*cp] = s;
		++(*cp);
	}
}

static void radix_sort (ulong *source, ulong N)
{
	// allocate heap memory to avoid the need of additional parameter
	ulong *temp = malloc (N * sizeof (ulong));
	assert (temp != NULL);

	radix (0, N, source, temp);
	radix (8, N, temp, source);
	radix (16, N, source, temp);
	radix (24, N, temp, source);

	free (temp);
}

static void make_random (ulong *data, ulong N)
{
	for ( ; N > 0; --N, ++data)
		*data = rand () | (rand () << 16);
}

static void check_order (ulong *data, ulong N)
{
	// only signal errors if any (should not be)
	for (--N ; N > 0; --N, ++data)
		assert (data[0] <= data[1]);
}

// test for big number of elements
static void test_radix (ulong N)
{
	unsigned i;
        ulong *data = malloc (N * sizeof (ulong));
	assert (data != NULL);

	make_random (data, N);
	radix_sort (data, N);
	check_order (data, N);

	free (data);
}

int main (void)
{
	test_radix (500000);
	return 0;
}

