/* $Id: snazzyheap.cpp 1109 2005-12-01 22:47:31Z tco $ */


#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <malloc.h>
#include <stdio.h>
#include "header.h"
#include "list.h"


#if !defined(_SOS_WINDOWS) && !defined(_WINDOWS) && !defined(__BORLANDC__)
#define UNIX
#endif


/* Tim's array.cpp and SnazzyHeap.cpp in one file.                */
/* TO USE:      Simply compile and link with this .o file.      */
/*              On UNIX, compile with UNIX defined (-DUNIX)     */
/*                                                              */
/* WHY USE:     1. It's much faster                             */
/*              2. It uses less memory                          */
/*              3. It is thread-safe                            */
/*              4. It has various features to aid debugging:    */
/*                 - MallocCheck() : a fast and comprehensive   */
/*                   consistency check to look for memory       */
/*                   corruption                                 */
/*                 - MallocLeakReport()                         */
/*                 - MallocMemUsed() / MallocMemTotal()         */
/*                 - If you free something that's already free, */
/*                   you get an error. This feature is always   */
/*                   turned on                                  */
/*                 - Filling tiles with garbage (0xf5ee)        */
/*                   when freed.                                */
/*                                                                */
/* It's faster because it doesn't waste time coalescing blocks  */
/* only to break them up again;  and it achieves less                 */
/* fragmentation by using a compromise on "powers of 2"; it        */
/* uses only 4 bytes for block headers; and the "1.5*2^n" rule        */
/* makes it work well with realloc()'s (for sets&lists).        */
/*                                                                */
/* If you want to implement efficient sets&lists, you should    */
/* let 'realloc' take care of rounding the size up to certain   */
/* special sizes.                                                */
/*                                                                */
/* LIMITATIONS:        1. On HP processors, if you use multi-threaded        */
/* code, then someone needs to implement locking using semget/  */
/* et al.                                                       */
/*                                                              */
/* 2.  http://www.microquill.com/kb/faq_ans.htm#lrgblocks :     */
/* "GDI functions may fail when operating on a large block      */
/* allocated by [nonstandard heap], when that block straddles   */
/* two adjacent VirtualAlloc ranges."   When allocating memory  */
/* for Win32 GDI functions, such as bitmaps, use GlobalAlloc(), */
/* _or_ we can make SnazzyHeap work around this BUG in GDI      */
/* by preventing SnazzyHeap from coalescing adjacent wholesale  */
/* memory ranges.  But this also makes it slightly less         */
/* efficient. */




/* Public, heap-wide entry points.  Each one operates on the heap that */
/* the global 'MainHeap' pointer currently designates.                 */

void MallocCheck();
    /* Checks the heap for consistency.  If there's an error, it calls */
    /* 'assert_failed()' (via the assert macro). */

void MallocLeakReport(char *filename);
    /* Print a report on memory usage, broken down by tile size. */
    /* filename==NULL means stdout. */

int MallocMemUsed();
    /* How many bytes are in allocated (non-free) small tiles? */

int MallocMemTotal();
    /* How much memory has been obtained from the operating system */
    /* for small-tile regions?  = MallocMemUsed() + free tiles      */
    /* (+ the per-region bookkeeping headers). */

void MallocSetHeap(int n);
    /* Instead of using the main heap, use an alternative one for all */
    /* subsequent allocs/deallocs. Later you can return to the main   */
    /* heap using MallocSetHeap(0).  Important: don't try to free()   */
    /* something allocated in another heap! */

void MallocFreeHeap();
    /* Free everything in the current heap. */

void assert_failed(kstr filename, int lineno, kstr condition);
    /* Failure hook named by the comments above; defined elsewhere. */





/* Boolean spellings and an array-zeroing helper, defined here only */
/* when 'yes' has not already been defined by an earlier header.    */
#ifndef yes
#define yes                        true
#define no                        false
#define clearA(A)                memset(A,0,sizeof(A))
#endif

/* HEAP_NUMPOWERS: number of distinct tile size classes (entries in */
/* power2size[] and freechain[]). */
#ifdef ALIGN64
#define HEAP_NUMPOWERS                54
#else
#define HEAP_NUMPOWERS                56
#endif
/* MAX_HEAP_SIZE: the largest span FreeTile() will accept. */
#define MAX_HEAP_SIZE                (4<<(HEAP_NUMPOWERS/2))
/* SMALLTILE_MAXIMUM: tiles whose outer size exceeds this are handled */
/* by BigMalloc()/BigFree() instead of the free chains. */
#define SMALLTILE_MAXIMUM        98304
/* tilenext: the free-chain link stored just after a free tile's */
/* 4-byte header. */
#define tilenext(tile)                *(char**)((char*)tile+4)
/* ROUNDUPTOPAGE: round n up to a multiple of 64K.  Note that the    */
/* argument is not parenthesised, and that an unsigned n of 0 yields */
/* 0 rather than 65536. */
#define ROUNDUPTOPAGE(n)        (((n-1)|65535)+1)
    // Windows VirtualAlloc() has a granularity of 64K


/*
    I use the word 'tile' to mean a malloc block.
    A 'tile_type' is a ptr to the start of the
    tile including the header, i.e. this is NOT
    the pointer returned to the user.
    For the pointers returned to the user I always use 'void*'.
*/
typedef char* tile_type;
typedef char* str;
typedef unsigned int uint;


/* Platform-specific primitives; one implementation is provided for */
/* UNIX (mmap/munmap) and one for Windows (VirtualAlloc/VirtualFree) */
/* further down this file. */
static char* RawRegion(int numbytes);
static void FreeRawRegion(void* mem, int numbytes);
    /* Allocate & deallocate large memory blocks from the OS */


/* A complete malloc-style heap.  Small tiles live inside            */
/* 'WholesaleRegion's obtained from the OS and are recycled through  */
/* per-size free chains; big tiles each get their own raw region and */
/* are tracked on a doubly-linked list.  The class has no            */
/* constructor: call Initialise() before use.                        */
class Heap {

    /* WholesaleRegion: interfacing with virtual memory */
    /* This bookkeeping record sits at the END of the raw memory it   */
    /* describes; start()/end() delimit the tile area in front of it. */
    struct WholesaleRegion {     // A contiguous set of pages
        #ifdef ALIGN64
        uint dummy;
        uint numbytes;                            // Subtract this from region
        union {
            WholesaleRegion *next;                //
            char dummy2[8];
        };
        #else
        uint numbytes;                            // Subtract this from region
        WholesaleRegion *next;                //
        #endif
        // First tile of the region: 'numbytes' bytes before the numbytes field.
        tile_type start() { return (tile_type)&numbytes - numbytes; }
        // One past the last tile: the address of the numbytes field itself.
        tile_type end() { return (tile_type)&numbytes; }
    };

    /* Header placed immediately before the user pointer of a big tile. */
    /* 'header4' occupies the 4 bytes directly in front of the user     */
    /* data, where a small tile's header would be.                      */
    struct BigTileHeader {
        BigTileHeader *prev, *next;
        int signature;
        #define BIGBIGBIG        0xb16b16
        int header4;
    };

    WholesaleRegion *regions;                // Ideally there would only
    // ever be one WholesaleRegion, but this is not always possible.
    BigTileHeader *big_root;                // Head of the big-tile list
    int NumFreedSinceMerge, NumUsed;        // Statistics for merging
    static uint size2power[512];            // size -> size-class, for size < 512
    static uint power2size[HEAP_NUMPOWERS]; // size-class -> tile size in bytes
    tile_type freechain[HEAP_NUMPOWERS];    // One free list per size-class
    volatile long Lock;                     // For thread-safe mallocs

    void FreeTile(char* p, uint size, uint good_h=0);
    int MergeTiles();
    bool AddNewRegion(uint numbytes);
    void WaitForLock();
    void Unlock();
    void* BigMalloc(size_t size);
    void BigFree(void *ptr);

    static uint CalcSize2Power(uint size);
    static uint Size2Power(uint size);
    static uint FreeTileToSize(tile_type tile);
    static uint AllocdTileToSize(tile_type tile);
    static uint TileSize(tile_type tile);
    // A tile is free when bit 0 of its header word is set.
    static bool IsFree(tile_type tile) { return (*(int*)tile & 1); }


public:
    void* malloc(size_t size);
    void* realloc(void* p, size_t size);
    void* calloc(size_t size, size_t n);
    void free(void* p);
        /* The normal malloc functions. */

    static bool AssertTile(tile_type tile);
    bool Assert(void);
        /* For checking for corruption in the heap. */

    void Report(FILE *output);
        /* Prepare a report on memory usage. */

    int MemUsed();
        /* Total memory in used tiles */

    int MemTotal(void);
        /* Total memory allocated by the operating system */

    void FreeEverything();
        /* Free all tiles at once. */

    void Initialise(uint size);
        /* A constructor.  Don't use the normal constructor syntax */
        /* because this leads to confusion & circularities. */

    friend class HeapIterator;
    friend uint snazzyMsize (void* ptr);
    friend void MallocSetHeap(int n);
	friend void MallocOptimise();
};


/* Storage for the size-class lookup tables shared by all heaps; */
/* they are filled in by the first call to Heap::Initialise().   */
uint Heap::size2power[512];
uint Heap::power2size[HEAP_NUMPOWERS];
/* MainHeap is the heap currently in use (a pointer); AlternateHeap */
/* is the pool of ten Heap objects that MallocSetHeap() selects from. */
Heap* MainHeap, AlternateHeap[10];








/*---------------------- Heaps: malloc & free -----------------------*/


/* 
The representation for small tiles is as follows:
-------------------------------------------------

* There is a 4-byte header for all tiles. Ignoring bits 0 & 1,
this header specifies the size of the tile (including the header
itself).

* Free tiles are denoted by this header bit0=1.

* Allocated tiles have header bit0=0.

* Free blocks have bit0=1, and to get the size of the block
just ignore this bit.

* All 'sizes' include the 4 bytes of the header.

* There are only certain allowable sizes of tile.  A tile
can only have a size which is either a power of 2 or 150%
of a power of 2, starting from 8, i.e. 8, 12, 16, 24, 32,
48, 64, 96, 128, 192, ... . Tiles are always rounded up to
one of these sizes.  If the caller requests e.g. 25 bytes,
then we allocate 32 bytes and 7 bytes are wasted (we don't
attempt to set it up as a free block).

* In addition, you can have free-tiles of size 4 bytes (i.e.
only a header).  However you can't have allocated tiles of
this size. 4-byte free blocks are too small to worry about,
so we just let them lie there wasting space until being
reclaimed by the MergeTiles() function.

* All free blocks (except for these 4-byters) have, in addition
to the 4-byte header, a 4-byte pointer which forms a link to
the next free block of the same size, with the exception of
free blocks of size 4 bytes (which don't have any space for
anything other than the header).



Big tiles work as follows:
--------------------------
    Any tile whose outer size > SMALLTILE_MAXIMUM is regarded as
a 'big tile' and is allocated using a special system. This system
involves allocating each big tile in its very own mmap segment.
Big tiles also have a header, which has the same format as 
above, but also before this 4-byte header are 2 pointers. The
2 pointers form a doubly linked list.  We need this doubly-
linked list in order to implement Heap::Free().
    When a big tile is freed, it is returned immediately to
the OS.
    We allow free tiles of size > SMALLTILE_MAXIMUM, but never
used tiles.
    We treat big tiles as a special case because (a) we get
no benefit from the "power of 2 or 1.5 times power of 2" rule,
and (b) we reduce physical memory consumption by giving these
pages back to the OS.

*/



uint Heap::TileSize(tile_type tile)
/* Outer size of a tile (free or allocated): the rounded-up size     */
/* including the header, with both flag bits stripped.  A header of  */
/* all ones marks a minimal header-only free tile. */
{
    const uint header = *(uint*)tile;

    if (header != (uint)-1)
        return header & ~3;
#ifdef ALIGN64
    return 8;
#else
    return 4;
#endif
}


uint Heap::AllocdTileToSize(tile_type tile)
/* Size in bytes of an allocated tile, header included: the header */
/* word with bit 1 masked off. */
{
    const uint header = *(uint*)tile;
    return header & ~2;
}


uint Heap::FreeTileToSize(tile_type tile)
/* Size in bytes of a free tile, header included.  A header of -1 */
/* denotes the minimal header-only free tile; otherwise the size  */
/* is the header word with bit 0 (the 'free' flag) masked off.    */
{
    const int header = *(int*)tile;

    if (header != -1)
        return header & ~1;
#ifdef ALIGN64
    return 8;
#else
    return 4;
#endif
}


uint Heap::CalcSize2Power(uint size)
/* Linear search of power2size[] for the first size-class whose tile */
/* size is >= 'size'.  The caller must pass a size that does not     */
/* exceed the largest entry; no bound is checked here. */
{
    uint h = 0;

    while (power2size[h] < size)
        h++;
    return h;
}


uint Heap::Size2Power(uint size)
/* Map a byte count to its size-class: table lookup for sizes below */
/* 512, linear search otherwise. */
{
    return (size < 512) ? size2power[size] : CalcSize2Power(size);
}


static int SuitableBreak(void* p)
/* We try to optimise the application's use of the Intel chips' 128-byte */
/* primary cache lines.  How suitable is 'p' as a place to break two     */
/* tiles?  Returns a score from 4 (poor) to 9 (128-byte aligned); the    */
/* callers only ever compare two scores against each other.              */
{
#ifdef ALIGN64
    /* Score the address 4 bytes past the break.  This adjustment must */
    /* happen BEFORE the offset is taken; previously it came after and */
    /* therefore had no effect. */
    p = (char*)p + 4;
#endif
    /* size_t rather than long: long is narrower than a pointer on */
    /* some 64-bit platforms. */
    const size_t offset = (size_t)p & 127;

    if (offset == 0)
        return 9;
    else if ((offset & 63) == 0)
        return 8;
    else if ((offset & 31) == 0)
        return 7;
    else if ((offset & 15) == 0)
        return 6;
    else if ((offset & 7) == 0)
        return 5;
    else return 4;
}
    

void Heap::FreeTile(char* p, uint size, uint good_h)
/* We have some free memory which might not have one of the */
/* allowable sizes.  Break it up into pieces if necessary   */
/* and free it.  Insofar as we have choice in the way we    */
/* break it up, try to break it into multiples of 'good_h's */
/* size (this is a hint as to likely next size to be            */
/* allocated). */
{   uint h, hsz;
    char* p2;

    if (size > MAX_HEAP_SIZE) {
        assert(false);
        return;
    }
#ifdef ALIGN64
    assert((size & 7) == 0);
#endif

    /* Try to make it a multiple of 'good_h': */
    if (good_h && power2size[good_h] <= size) {
        h = good_h;
        while (power2size[h+2] <= size)
                h += 2;
        goto HAVE_H;
    }

    /* Just use the standard algorithm: cut it into the biggest */
    /* possible block with the smallest possible remainder, and */
    /* repeat. */
#ifdef ALIGN64
    while (size > 8) {
#else
    while (size > 4) {
#endif
        h = Size2Power(size);
        if (power2size[h] > size)
            h--;
        HAVE_H:
        hsz = power2size[h];
        if (SuitableBreak(p+hsz) > SuitableBreak(p+size-hsz))
            p2 = p + hsz;
        else
            p2 = p, p = p + size - hsz;
        *((int*)p) = hsz | 1;
        *(void**)(p+4) = freechain[h];
        freechain[h] = p;
        p = p2;
        size -= hsz;
    }

    /* Free tiles of size 4 are not put into any free chain; */
    /* they are simply wasted (until the next MergeTiles).   */
    if (size)
        *(int*)p = -1;

    /* I did try to coalesce 2 blocks of size 4 until I realised */
    /* that this would often cause page faults as we write             */
    /* outside the allocated region. */
}


int Heap::MergeTiles()
/* Reclaim space lost to fragmented memory by merging */
/* adjacent free blocks.  Return the number of tiles  */
/* merged. */
{   WholesaleRegion *region;
    tile_type p, q;
    char* end;
    int n=0;

    memset(freechain, 0, sizeof(freechain));
    for (region=regions; region; region=region->next) {
        p = region->start();
        end = region->end();
        while (p < end) {
            if (IsFree(p)) {
                q = p;
                do {
                    q += FreeTileToSize(q);
                    n++;
                } while (q < end && IsFree(q));
                FreeTile(p, (uint)(q-p));
                p = q;
            }
            else {
                p += AllocdTileToSize(p);
            }
        }
    }
    return n;
}


void* Heap::malloc(size_t size)
/* Allocate at least 'size' bytes and return a pointer to the user  */
/* area (4 bytes past the tile header).  Requests that would need   */
/* an outer size above SMALLTILE_MAXIMUM go to BigMalloc().         */
/* Otherwise the size is rounded up to a size-class and satisfied,  */
/* in order of preference, from: an exact-size free tile; a larger  */
/* free tile split in two; the free chains after a MergeTiles();    */
/* a fresh region from the OS.  Returns NULL only if AddNewRegion() */
/* reports failure.                                                 */
/* NOTE(review): if RawRegion() throws (Windows build) the lock is  */
/* still held when the exception leaves this function - confirm     */
/* whether callers rely on that never happening.                    */
{   uint i, h, header;
    uint *p;

    WaitForLock();                  // To make it thread-safe

    /* Big tiles are a special case: */
    if (size > SMALLTILE_MAXIMUM-4) {
        void* ptr = BigMalloc(size);
        Unlock();
        return ptr;
    }

    size += 4;                        // The size should include the header.
    h = Size2Power(size);                // What power are we in?
    size = power2size[h];                // Round the size up to this power.
    header = size|2;                     // Bit 1 marks the tile as allocated.
    NumUsed++;

    RETRY:

    /* Case I: do we have a block of exactly the right size? */
    p = (uint*)freechain[h];
    if (p) {
        freechain[h] = tilenext(p);
        *p = header;
        Unlock();
        return (tile_type)(p + 1);        // Mark the block as used and return it.
    }

    /* Case II: break up a larger free block */
    if (h+2 < HEAP_NUMPOWERS) {
        i = h+2;
            if ((p=(uint*)freechain[i]) != NULL)
                goto FOUND_LARGER;
            /* The next best thing is to halve a block of          */
            /* double the size.  That way we'll have a spare block */
            /* of the same size which is likely to be needed.      */

        i = h;
        while (++i < HEAP_NUMPOWERS) {   // Break up other blocks.
            p = (uint*)freechain[i];
            if (p) {
                FOUND_LARGER:
                char *p2;
                freechain[i] = tilenext(p);
                /* Byte arithmetic: 'p' is a uint*, so the offsets */
                /* must be applied to a char* or they would be     */
                /* scaled by sizeof(uint).                         */
                if (SuitableBreak((char*)p + size) >
                                SuitableBreak((char*)p + power2size[i] - size))
                    p2 = (char*)p + size;
                else {
                    p2 = (char*)p;
                    p = (uint*)((char*)p + power2size[i] - size);
                }
                *p = header;
                FreeTile(p2, power2size[i] - size, h);
                Unlock();
                return (tile_type)(p + 1);
            }
        }
    }

    /* Case III: Consolidate adjoining tiles and try again */
    if (NumFreedSinceMerge * 20 > NumUsed) {
        NumFreedSinceMerge = 0;
        if (MergeTiles() > 100)                // at least 100 tiles merged
            goto RETRY;
    }

    /* Case IV: Request more memory from the OS */
    uint bucket;
    bucket = 65536;
    if (size >= bucket)
        bucket = ROUNDUPTOPAGE(size);
    assert(bucket <= MAX_HEAP_SIZE);
    if (! AddNewRegion(bucket)) {
        /* Nothing was allocated: undo the bookkeeping and, above all, */
        /* release the lock so that later calls do not hang.           */
        NumUsed--;
        Unlock();
        return NULL;       // Actually, AddNewRegion should throw bad_alloc.
    }
    goto RETRY;
}


void Heap::free(void* p)
/* Return a tile to the heap.  'p' is a pointer previously returned */
/* by malloc/calloc/realloc of THIS heap, or NULL (ignored).        */
/* Freeing a tile that is already marked free triggers an assert    */
/* and is otherwise ignored.  Big tiles are handed to BigFree();    */
/* small tiles are pushed onto the free chain of their size-class.  */
/* No coalescing happens here - see MergeTiles().                   */
{   tile_type tile;
    uint size, h;

    if (p == NULL)
        return;
    tile = (char*)p - 4;

    /* Inspect and modify the header only while holding the lock, so */
    /* that the double-free check cannot race with other threads.    */
    WaitForLock();
    if (IsFree(tile)) {
        Unlock();
        assert(false);
        return;
    }
    size = AllocdTileToSize(tile);
    if (size > SMALLTILE_MAXIMUM) {
        BigFree(p);
        Unlock();
        return;
    }
    h = Size2Power(size);
    *((int*)tile) = size | 1;
    tilenext(tile) = freechain[h];
    freechain[h] = tile;
    NumFreedSinceMerge++;
    NumUsed--;

#if 1
//Does this still crash on HP-UX in Oslo, in their click2smarts program?
    /* OPTIONAL:  Put garbage into the tile as an aid to */
    /* detecting memory corruption errors.               */
    /* This is done BEFORE releasing the lock: once the  */
    /* lock is dropped another thread may be handed this */
    /* tile, and we must not scribble over its data.     */
    /* The fill starts after the header and the chain    */
    /* link, and writes the same number of words as the  */
    /* original countdown loop did.                      */
    int *pi = (int*)(tile + 4 + sizeof(void*));
    uint words = (size - (uint)sizeof(void*)) >> 2;
    while (words > 1) {
        *pi++ = 0xf5eef5ee;
        words--;
    }
#endif

    Unlock();
}


void* Heap::realloc(void* oldv, size_t newsize)
/* Resize an allocation.                                             */
/*   oldv == NULL            : behaves like malloc(newsize).         */
/*   oldv already free       : asserts and returns NULL.             */
/*   same rounded tile size  : returns oldv unchanged.               */
/*   otherwise               : allocates a new tile, copies the      */
/*                             smaller of the two user areas, frees  */
/*                             the old tile and returns the new one. */
/* If the new tile cannot be allocated, NULL is returned and the     */
/* old tile is left untouched.                                       */
{   uint oldsize;
    void *newv;
    size_t ncopy;

    if (oldv == NULL)
        return malloc(newsize);
    if (IsFree((char*)oldv-4)) {
        assert(false);
        return NULL;
    }
    oldsize = AllocdTileToSize((char*)oldv-4);

    /* What 'AllocdSize' would we get if we were to malloc(newsize)? */
    if (newsize > SMALLTILE_MAXIMUM-4) {
        newsize = power2size[Size2Power(newsize+sizeof(BigTileHeader))];
            // Round it up, in case we're going to expand it further.
        newsize = ((newsize-1)|4095)+1 - sizeof(BigTileHeader) + 4;        
            // Ensure we know exactly how many bytes we're going to get.
    }
    else newsize = power2size[Size2Power(newsize+4)];

    /* Expanding, shrinking or staying the same? */
    if (oldsize == newsize)
        return oldv;
    if (oldsize == 0)
        return malloc(newsize-4);

    /* Allocate-copy-free.  When shrinking, the alternative would be  */
    /* to trim the tile in place and FreeTile() the tail; copying is  */
    /* preferred because it minimises fragmentation and the time      */
    /* taken for the memcpy seems to be negligible.                   */
    newv = malloc(newsize-4);
    if (newv == NULL)
        return NULL;                    // Old tile remains valid.
    ncopy = (oldsize > newsize ? newsize : oldsize) - 4;
    memcpy(newv, oldv, ncopy);
    if (oldsize > newsize)
        assert(AllocdTileToSize((char*)newv-4) == newsize);
    free(oldv);
    return newv;
}


void* Heap::calloc(size_t a, size_t b)
/* Allocate a*b bytes and zero them.  Returns NULL if the product */
/* does not fit in a size_t or if the allocation fails.           */
{
    if (b != 0 && a > (size_t)-1 / b)
        return NULL;                    // a*b would overflow.

    const size_t total = a * b;
    void* mem = this->malloc(total);
    if (mem != NULL)
        memset(mem, 0, total);
    return mem;
}


void* Heap::BigMalloc(size_t size)
/* Allocate a big tile in a raw region of its own.  The region holds */
/* a BigTileHeader followed by the user area and is rounded up to a  */
/* multiple of 4K.  The new tile is linked at the head of the        */
/* big-tile list.  Returns the user pointer, or NULL if the request  */
/* is too large to describe or the OS refuses the memory.            */
/* The caller must hold the heap lock.                               */
{   BigTileHeader *big;

    /* RawRegion() and 'header4' both use int, so refuse anything    */
    /* that would not fit (this also rules out size_t wrap-around    */
    /* in the arithmetic below).                                     */
    const size_t limit = 0x7fffffff;
    if (size > limit - sizeof(BigTileHeader) - 4095)
        return NULL;

    size += sizeof(BigTileHeader);
    size = ((size-1)|4095)+1;        // Round up to multiple of 4K
    big = (BigTileHeader*)RawRegion(size);
    if (big == NULL)
        big = (BigTileHeader*)RawRegion(size);   // One retry, as before.
    if (big == NULL)
        return NULL;
    big->prev = NULL;
    big->next = big_root;
    big->signature = BIGBIGBIG;
    /* Same convention as small tiles: the size recorded in the */
    /* header word counts the 4-byte header plus the user area. */
    big->header4 = size - sizeof(BigTileHeader) + 4;
    if (big_root)
        big_root->prev = big;
    big_root = big;
    return big + 1;
}


void Heap::BigFree(void *ptr)
/* Release a big tile: verify its signature, unlink it from the  */
/* big-tile list and give its raw region back to the OS.  'ptr'  */
/* is the user pointer returned by BigMalloc().  The caller must */
/* hold the heap lock.                                           */
{   BigTileHeader *big=(BigTileHeader*)ptr - 1;

    if (big->signature != BIGBIGBIG) {
        assert(false);
        return;
    }

    /* Unlink it from the linked list: */
    if (big->next)
        big->next->prev = big->prev;
    if (big->prev)
        big->prev->next = big->next;
    else {
        assert(big_root == big);
        big_root = big->next;
    }

    /* Release it back to the operating system.  BigMalloc() stored   */
    /* header4 = regionsize - sizeof(BigTileHeader) + 4, so invert    */
    /* that to recover the true length of the raw region.             */
    FreeRawRegion(big, big->header4 - 4 + sizeof(BigTileHeader));
}


bool Heap::AddNewRegion(uint numbytes)
/* Create a new WholesaleRegion for this heap. Initialise it for */
/* immediate use with malloc/free. */
{   WholesaleRegion *region, **regionp;
    char *mem, *tmp;

    /* Get the raw memory: */
    assert((numbytes & 4095) == 0);
    mem = RawRegion(numbytes);

    /* Now update regions: */
    for (regionp=&regions; (region=*regionp) != NULL; 
                            regionp=&region->next) {
        if (mem == (char*)(region+1)) {
            /* Good: we can coalesce with this region. */
            tmp = region->end();
            region = (WholesaleRegion*)(mem + numbytes) - 1;
            region->numbytes = (*regionp)->numbytes + numbytes;
            region->next = (*regionp)->next;
            FreeTile(tmp, numbytes);
            *regionp = region;
            return yes;
        }
        else if ((char*)region > mem)
            break;
    }
    region = (WholesaleRegion*)(mem + numbytes) - 1;
#ifdef ALIGN64
    region->numbytes = numbytes - 16;
    mem += 4;
#else
    region->numbytes = numbytes - sizeof(WholesaleRegion);
#endif
    region->next = *regionp;
    *regionp = region;
    FreeTile(mem, region->numbytes);
    return yes;
}


void Heap::FreeEverything()
/* Return all memory to the virtual memory manager: every small-tile */
/* region and every big tile.  Afterwards the heap has no regions,   */
/* empty free chains and zeroed statistics.  No lock is taken here.  */
/* NOTE(review): on Windows FreeRawRegion() ignores the length, so a */
/* region that AddNewRegion() coalesced from several raw blocks is   */
/* presumably only partly released - confirm.                        */
{   WholesaleRegion *region, *rnext;
    BigTileHeader *tmp;

    NumFreedSinceMerge = NumUsed = 0;
    clearA(freechain);
    for (region=regions; region; region=rnext) {
        rnext = region->next;       // Read before the record is unmapped.
        FreeRawRegion((char*)region - region->numbytes, 
                        region->numbytes + sizeof(WholesaleRegion));
    }
    regions = NULL;
    while (big_root) {
        tmp = big_root;
        big_root = tmp->next;
        /* header4 = regionsize - sizeof(BigTileHeader) + 4 (see */
        /* BigMalloc), so invert that to get the region length.  */
        FreeRawRegion(tmp, tmp->header4 - 4 + sizeof(BigTileHeader));
    }
}


void Heap::Initialise(uint size)
/* Stand-in for a constructor.  The first call (detected by          */
/* power2size[0] still being zero) also builds the shared size-class */
/* tables.  The heap is then reset to empty, made the current        */
/* 'MainHeap', and given an initial region of 'size' bytes.          */
{
    /* One-off set-up of the static lookup tables. */
    if (power2size[0] == 0) {
        for (uint idx = 0; idx < HEAP_NUMPOWERS; idx++) {
            /* Even entries are powers of two, odd entries are 1.5x. */
            const uint shift = idx >> 1;
#ifdef ALIGN64
            power2size[idx] = ((idx & 1) ? 24 : 16) << shift;
#else
            power2size[idx] = ((idx & 1) ? 12 : 8) << shift;
            assert(sizeof(char*) == 4);
#endif
        }
        size2power[0] = 0;
        for (uint sz = 1; sz < 512; sz++)
            size2power[sz] = CalcSize2Power(sz);
    }

    /* Per-heap state. */
    Lock = 0;
    NumFreedSinceMerge = NumUsed = 0;
    big_root = NULL;
    regions = NULL;
    clearA(freechain);
    MainHeap = this;
    AddNewRegion(size);
}








/*----------------- Platform-specific stuff: --------------*/

#ifdef UNIX
#include <unistd.h>
#include <sys/mman.h>


static char* RawRegion(int numbytes)
/* Obtain 'numbytes' of private, anonymous, read/write memory from */
/* the OS with mmap().  On failure prints a message and terminates */
/* the process with exit code 4, so callers never see NULL.        */
{   char *mem;

    /* Anonymous mappings take fd -1: some implementations require */
    /* it, and all accept it. */
    mem = (char*)mmap(NULL, numbytes,
                    PROT_READ|PROT_WRITE,
                    MAP_PRIVATE|MAP_ANON,-1,0);
    if (mem == MAP_FAILED) {
        printf("Out of memory.\n");
        exit(4);
        return NULL;
    }
    else return mem;
}


static void FreeRawRegion(void* mem, int numbytes)
/* Hand a block obtained from RawRegion() back to the OS.  The */
/* result of munmap() is deliberately not examined. */
{
    (void)munmap(mem, numbytes);
}


/* Acquire the heap lock (UNIX version).  Increments 'Lock'; if the */
/* result is 1 the lock is ours, otherwise the increment is undone  */
/* and we sleep for 10ms before trying again.  No atomic primitive  */
/* is used, so this is only sound under the conditions listed in    */
/* the comment inside the loop. */
void Heap::WaitForLock()
{
    do {
        Lock++;
        if (Lock == 1)
            return;
        Lock--;
        usleep(10000/*uS*/);
        /* This is guaranteed to work if:
        1. The compiler implements the 'volatile' keyword correctly
        2. The compiler compiles 'Lock++' into:  INC [mem]
        3. You're not using multi-threading on a dual-CPU machine.

        These conditions are all met on Smarts Unix servers.
        */
    } while (1);
}


/* Release the heap lock (UNIX version) by undoing the increment */
/* made in WaitForLock().  Like WaitForLock(), this is a plain   */
/* decrement of a volatile counter, not an atomic operation.     */
void Heap::Unlock()
{
    Lock--;
}


#else

#undef interface
#include <windows.h>
#include <new>


static char* RawRegion(int numbytes)
/* Reserve and commit 'numbytes' of read/write memory with */
/* VirtualAlloc().  Throws std::bad_alloc on failure.      */
{
    void* const mem = VirtualAlloc(NULL, numbytes,
                            MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE);

    if (mem != NULL)
        return (char*)mem;
    throw std::bad_alloc();
}


static void FreeRawRegion(void* mem, int numbytes)
/* Release a block obtained from RawRegion().  'numbytes' is not */
/* used on Windows: VirtualFree() is called with a size of 0 and */
/* MEM_RELEASE. */
{
    (void)numbytes;
    VirtualFree(mem, 0, MEM_RELEASE);
}


void Heap::WaitForLock()
/* Acquire the heap lock (Windows version).  Bump the counter with */
/* an interlocked increment; a result of 1 means the lock is ours. */
/* Otherwise undo the increment, sleep 10ms and try again.         */
{
    for (;;) {
        if (InterlockedIncrement((long*)&Lock) == 1)
            return;
        InterlockedDecrement((long*)&Lock);
        Sleep(10);
    }
}


void Heap::Unlock()
/* Release the heap lock (Windows version) with an interlocked */
/* decrement of the counter. */
{
    long* const counter = (long*)&Lock;
    InterlockedDecrement(counter);
}

#endif





/*-------------- Optional stuff: --------------*/
/* Memory corruption checking, memory leak detection, multiple heaps, */
/* memory usage statistics. */


/* Walks the small tiles of one heap, region by region.  Each call */
/* to the post-increment operator yields the next tile, or NULL    */
/* when there are no more.  Both free and allocated tiles are      */
/* yielded; callers test Heap::IsFree() themselves.  The iterator  */
/* does no locking of its own.                                     */
class HeapIterator {
    Heap::WholesaleRegion *region;      // Region currently being walked
    char* end;                          // End of that region's tile area
    tile_type tile;                     // Last tile returned (NULL before the first)
    class Heap* heap;                   // The heap being iterated

public:
    HeapIterator(class Heap* heap);
    tile_type operator++(int);
    void reset();
};

/* Loop header helper: 'for (each_tile) {...}' visits every tile of */
/* MainHeap - always MainHeap, whichever object the enclosing       */
/* method was called on.  It expects a 'tile' variable in scope.    */
#define each_tile        HeapIterator iter(MainHeap); (tile = iter++) != NULL;



void HeapIterator::reset()
/* Rewind to the heap's first region with no current tile. */
{
    region = heap->regions;
    tile = NULL;
}


HeapIterator::HeapIterator(Heap* _heap)
/* Bind the iterator to a heap and position it at the start. */
    : heap(_heap)
{
    reset();
}


tile_type HeapIterator::operator++(int)
/* Advance to the next tile and return it, or NULL when every region */
/* has been walked.  The first call after construction or reset()    */
/* returns the first tile of the FIRST region (previously the first  */
/* region was skipped).  Calling again after NULL has been returned  */
/* keeps returning NULL.  Free tiles are returned as well as         */
/* allocated ones. */
{
    if (tile == NULL) {
        /* Not started yet, or already finished. */
        if (region == NULL)
            return NULL;
        tile = region->start();
        end = region->end();
    }
    else tile += Heap::TileSize(tile);

    /* Step over exhausted (or empty) regions. */
    while (tile >= end) {
        region = region->next;
        if (region == NULL) {
            tile = NULL;
            return NULL;
        }
        tile = region->start();
        end = region->end();
    }
    return tile;
}


static void insitusort(void **Q, int n)
/* Sort the n pointers in Q into ascending address order, in place    */
/* and without allocating memory: a recursive quicksort using a       */
/* three-way partition (less than / equal to / greater than pivot).   */
{
    if (n <= 1)
        return;

    void* const pivot = Q[n/2];
    int lo = 0;         /* Q[0..lo)   holds elements below the pivot  */
    int mid = 0;        /* Q[lo..mid) holds elements equal to it      */
    int hi = n-1;       /* Q(hi..n)   holds elements above it         */

    while (mid <= hi) {
        void* cur = Q[mid];

        if (cur > pivot) {
            /* Move it to the top section; re-examine what came back. */
            Q[mid] = Q[hi];
            Q[hi] = cur;
            hi--;
        }
        else {
            if (cur < pivot) {
                if (lo < mid) {
                    Q[mid] = Q[lo];
                    Q[lo] = cur;
                }
                lo++;
            }
            mid++;
        }
    }

    insitusort(Q, lo);
    /* Q[hi] is a pivot-equal element, so including it is harmless. */
    insitusort(Q+hi, n-hi);
}


bool Heap::AssertTile(tile_type tile)
/* Sanity-check one tile header: it must not carry the 'free' bit, */
/* and the header word must be non-zero and below 1<<26. */
{
    const uint header = *(uint*)tile;

    if (header & 1)
        return no;          // It's a free tile.
    return header > 0 && header < 1<<26;
}


bool Heap::Assert(void)
/* A detailed validation of the heap, to search for various types of    */
/* corruption.  If it finds an error, it will assert(false) and return  */
/* 'no'. Occasionally, if the heap is corrupt it will crash in here     */
/* instead. */
{   struct {
        tile_type *list;
        int a_idx;                  // allocated size
        int l_idx;                  // real size of list
        int i;                      // Where are we up to?
    } sorted[HEAP_NUMPOWERS];
    uint size, h, specialsize;
    WholesaleRegion *region;
    tile_type tile, t;
    char* special, *a;
    int NumFreeTiles;
    char *end;
    int i,n;

    WaitForLock();
    /*if (MemTotal() > 1024*1024)
        return yes;*/

    /* Build a _sorted_ list of free tiles from the free-list: */
    /* Note that it is unfortunate that we have to do memory   */
    /* allocation inside the memory check algorithm, but it's  */
    /* unavoidable if we don't want an O(n^2) algorithm.  In   */
    /* order to avoid interfering with the heap itself, we     */
    /* allocate this memory using a special low-level call.           */
    NumFreeTiles = 0;
    for (h=0; h < HEAP_NUMPOWERS; h++) {
        n = 0;
        for (t=(char*)freechain[h]; t; t=tilenext(t)) {
            if (++n > 1<<26) {
                assert(false);
                return no;         // We must be in a cycle.
            }
        }
        NumFreeTiles += n;
        sorted[h].a_idx = n;
    }
    specialsize = ROUNDUPTOPAGE(NumFreeTiles*sizeof(tile_type));
    a = special = RawRegion(specialsize);
    for (h=0; h < HEAP_NUMPOWERS; h++) {
        sorted[h].list = (tile_type*)a;
        a += sorted[h].a_idx * sizeof(tile_type);
        sorted[h].i = 0;
        i = 0;
        for (t=(char*)freechain[h]; t; t=tilenext(t)) {
            assert(i < sorted[h].a_idx);
            sorted[h].list[i++] = t;
        }
        insitusort((void**)sorted[h].list, i);
        for (int j=1; j < i; j++)
            assert(sorted[h].list[j-1] < sorted[h].list[j]);
        sorted[h].l_idx = i;
    }

    /* Walk through the heap: */
    for (region=regions; region; region=region->next) {
        tile = region->start();
        end = region->end();
        while (tile < end) {
            if (IsFree(tile)) {
                size = FreeTileToSize(tile);
#ifdef ALIGN64
                if (size == 8)
#else
                if (size == 4)
#endif
                    ;
                else {
                    h = Size2Power(size);
                    if (power2size[h] != size) {
                        assert(false);
                        return no;
                    }
                    if (tile != sorted[h].list[sorted[h].i]) {
                        if (tile < sorted[h].list[sorted[h].i]) {
                            // This free tile not found in the free-chain,
                            // or sorted[h].list[i] not found in the
                            // heap walk.
                            assert(false);
                            return no;
                        }
                        else {
                            // 'sorted[h].list[sorted[h].i]' is in the 
                            // free-chain but not in the heap walk.
                            assert(false);
                            return no;
                        }
                    }
                    if (++sorted[h].i > sorted[h].l_idx) {
                        assert(false);
                        return no;
                    }
                }
            }
            else {
                size = AllocdTileToSize(tile);
                if (! AssertTile(tile)) {
                    assert(false);
                    return no;
                }
            }
            tile += size;
        }
        assert(tile == end);
    }
    for (h=0; h < HEAP_NUMPOWERS; h++) {
        if (sorted[h].i != sorted[h].l_idx) {
            assert(false);
            return no;
            // The free-chain has more entries that we found in the heap.
        }
    }

    /* Check on the big tiles: */
    for (BigTileHeader *big=big_root; big; big=big->next) {
        if (big->prev)
            assert(big->prev->next == big);
        else assert(big == big_root);
        assert(big->signature == BIGBIGBIG);
    }

    /* Return our special memory region to the operating system: */
    FreeRawRegion(special,specialsize);
    Unlock();

    return yes;
}


void Heap::Report(FILE *output)
/* Prepare a report on memory usage. */
{   int count[2][HEAP_NUMPOWERS], h, total[2];
    tile_type tile;

    WaitForLock();
    clearA(count);
    for (each_tile) {
        h = Size2Power(TileSize(tile));
        count[IsFree(tile)][h]++;
    }
    Unlock();
    clearA(total);
    for (h=0; h < HEAP_NUMPOWERS; h++) {
        total[0] += count[0][h] * power2size[h];
        total[1] += count[1][h] * power2size[h];
    }
    fprintf(output, "%db allocated, %db free.  (%1.1f%% wastage)\n\n",
                    total[0], total[1],
                    total[1] * 100.0 / (total[0] + total[1]));
    for (h=0; h < HEAP_NUMPOWERS; h++) {
        if (count[no][h] + count[yes][h] > 0) {
            fprintf(output, "%7d bytes:  %7d used, %7d free = %dK\n",
                    power2size[h], count[no][h], count[yes][h],
                    (count[no][h] + count[yes][h]) * power2size[h]
                    / 1024);
        }
    }
    /* OPTIONAL:
        fprintf(output, "\n\nThese blocks are still allocated:\n");
        for (each_tile) {
            if (! IsFree(tile))
                fprintf(output, "0x%p : %d\n", tile+4, TileSize(tile));
        }
    }
    */
    fprintf(output, "\n\n");
}


int Heap::MemUsed()
/* Add up TileSize() over every tile that is not free.  The heap lock */
/* is held for the duration of the walk.                              */
{   int usedBytes = 0;
    tile_type tile;

    WaitForLock();
    for (each_tile) {
        if (Heap::IsFree(tile))
            continue;                   /* Free tiles don't count. */
        usedBytes += TileSize(tile);
    }
    Unlock();

    return usedBytes;
}


int Heap::MemTotal()
{   WholesaleRegion *region;
    int n=0;

    for (region=regions; region; region=region->next)
        n += region->numbytes + sizeof(WholesaleRegion);
    return n;
}


void MallocCheck()
/* Consistency check of both heaps.  Problems are reported through    */
/* assert (i.e. 'assert_failed()').  MallocCheck() is guaranteed to   */
/* find _all_ instances of corrupt data that can cause a subsequent   */
/* malloc/free/etc. to crash.                                         */
{
    /* First the C library's own heap... */
    assert(_heapchk() == _HEAPOK);

    /* ...then ours, provided one has been set up. */
    if (MainHeap != NULL) {
        MainHeap->Assert();
    }
}


void MallocLeakReport(str filename)
/* Print a report on memory usage.  filename==NULL means stdout.      */
/* Nothing is written if no heap has been set up yet.  If the file    */
/* cannot be opened, the failure is asserted and the call returns     */
/* without reporting.                                                 */
{   FILE *output;

    if (MainHeap == NULL)
        return;                     /* No heap yet: nothing to report. */

    if (filename == NULL)
        output = stdout;
    else output = fopen(filename, "wt");
    assert(output != NULL);
    if (output == NULL)
        return;                     /* Don't hand a NULL FILE* to Report(). */

    MainHeap->Report(output);
    if (output != stdout)
        fclose(output);
}


int MallocMemUsed()
/* Return MemUsed() of the current heap, or 0 if no heap has been     */
/* set up yet (MainHeap is only created on first use).                */
{
    if (MainHeap == NULL)
        return 0;
    return MainHeap->MemUsed();
}


int MallocMemTotal()
/* Return MemTotal() of the current heap, or 0 if no heap has been    */
/* set up yet (MainHeap is only created on first use).                */
{
    if (MainHeap == NULL)
        return 0;
    return MainHeap->MemTotal();
}



/* It's convenient to use multiple heaps if you want to free everything */
/* in one heap in one go, or you want to separately account for the     */
/* memory usage of different subsystems. */

void MallocSetHeap(int n)
/* Make AlternateHeap[n] the heap used by all subsequent allocs and   */
/* deallocs.  MallocSetHeap(0) returns to the main heap.  A heap that */
/* has no regions yet is initialised on first selection.              */
/* Important: don't try to free() something allocated in another      */
/* heap!                                                              */
/* NOTE(review): 'n' is not range-checked here - confirm callers only */
/* pass valid indices.                                                */
{
    MainHeap = &AlternateHeap[n];

    if (MainHeap->regions != NULL)
        return;                         /* Already set up. */
    MainHeap->Initialise(65536);
}


void MallocFreeHeap()
/* Free all the memory used by the current heap, then switch back to  */
/* heap 0.  Only alternate heaps 1..9 can be freed this way; for any  */
/* other MainHeap (heap 0 included) we assert and do nothing.         */
{   bool isAlternate = false;

    for (int slot = 1; slot < 10 && !isAlternate; slot++)
        isAlternate = (&AlternateHeap[slot] == MainHeap);

    if (!isAlternate) {
        assert(false);        // can't free Heap 0.
        return;
    }
    MainHeap->FreeEverything();
    MainHeap = &AlternateHeap[0];
}


void MallocOptimise()
/* Ask the current heap to merge its tiles.  Does nothing if no heap  */
/* has been set up yet.                                               */
{
    if (MainHeap == NULL)
        return;
    MainHeap->MergeTiles();
}


uint RoundUpForRealloc(uint size)
/* Currently an identity mapping: the size is handed back unchanged. */
{
    const uint rounded = size;

    return rounded;
}



/*-------------- The Bridge: --------------*/

static uint snazzyMsize (void* ptr)
/* Usable size of an allocated block: step back over the 4-byte tile  */
/* header to reach the tile, and subtract those 4 bytes from the      */
/* tile's size.                                                       */
{
    char* tile = (char*)ptr - 4;

    return Heap::TileSize(tile) - 4;
}


void* snazzyMalloc(size_t size)
/* Allocate 'size' bytes from the current heap.  Heap 0 is selected   */
/* (and thereby initialised) if no heap is current yet.  With ALIGN64 */
/* defined, the result is asserted to be 8-byte aligned.              */
{
    if (MainHeap == NULL) {
        MallocSetHeap(0);
    }

    void *p = MainHeap->malloc(size);
#ifdef ALIGN64
    assert(((long)p & 7) == 0);
#endif
    return p;
}


void* snazzyRealloc(void* ptr, size_t size)
/* Resize 'ptr' to 'size' bytes through the current heap.  Heap 0 is  */
/* selected if no heap is current yet.  Requests of 0x20000000 bytes  */
/* or more are asserted against.                                      */
{
    if (MainHeap == NULL) {
        MallocSetHeap(0);
    }
    assert(size < 0x20000000);

    void* resized = MainHeap->realloc(ptr, size);
    return resized;
}


static void* snazzyCalloc(size_t size, size_t n)
/* Forward both arguments, unchanged and in the same order, to the    */
/* current heap's calloc().  Heap 0 is selected if no heap is current */
/* yet.                                                               */
{
    if (MainHeap == NULL) {
        MallocSetHeap(0);
    }

    void* zeroed = MainHeap->calloc(size, n);
    return zeroed;
}


void snazzyFree(void *ptr)
/* Hand 'ptr' back to the current heap.  Unlike the allocating        */
/* entry points, this one does not create a heap on demand: a current */
/* heap is asserted to exist.                                         */
{
    assert(MainHeap != NULL);

    MainHeap->free(ptr);
}


char* snazzyStrdup(const char* s)
{
	if (s == NULL)
		return NULL;
    assert(MainHeap != NULL);
    return strcpy((str)MainHeap->malloc(strlen(s) + 1), s);
}





/*----------------------------- List: -----------------------------*/

void* aListNext(void** Ap, int elsize)
/* Append one (uninitialised) element to the dynamic array '*Ap' and  */
/* return a pointer to it.  The element count is an int stored        */
/* immediately before element 0.  '*Ap' is updated, since the block   */
/* may be created or moved.  When the block is too small, the request */
/* starts at 1.5x the current capacity and keeps growing by 1.5x      */
/* until it fits.                                                     */
{   char* block;

    if (*Ap == NULL) {
        /* Empty list: allocate the count plus a single element. */
        block = (char*)snazzyMalloc(elsize + sizeof(int));
        *(int*)block = 1;
        *Ap = block + sizeof(int);
        return *Ap;
    }

    block = (char*)*Ap - sizeof(int);
    int count = ++(*(int*)block);
    int bytesNeeded = count * elsize + sizeof(int);
    int bytesHave = snazzyMsize(block);
    if (bytesHave < bytesNeeded) {
        int request;
        for (request = bytesHave + bytesHave/2; request < bytesNeeded; )
            request += request/2;
        block = (char*)snazzyRealloc(block, request);
    }

    char* elems = block + sizeof(int);
    *Ap = elems;
    return elems + (count-1)*elsize;
}


void* aListNextClear(void** Ap, int elsize)
/* Like aListNext(), but the new element is zero-filled.  Appends one */
/* element to '*Ap', updates '*Ap' (the block may be created or       */
/* moved) and returns a pointer to the new element.                   */
{   char* block;

    if (*Ap == NULL) {
        /* Empty list: a zeroed block for the count plus one element. */
        block = (char*)snazzyCalloc(elsize + sizeof(int), 1);
        *(int*)block = 1;
        *Ap = block + sizeof(int);
        return *Ap;
    }

    block = (char*)*Ap - sizeof(int);
    int count = ++(*(int*)block);
    int bytesNeeded = count * elsize + sizeof(int);
    int bytesHave = snazzyMsize(block);
    if (bytesHave < bytesNeeded) {
        /* Grow by 1.5x steps until the new element fits. */
        int request;
        for (request = bytesHave + bytesHave/2; request < bytesNeeded; )
            request += request/2;
        block = (char*)snazzyRealloc(block, request);
    }

    char* elems = block + sizeof(int);
    *Ap = elems;

    char* fresh = elems + (count-1)*elsize;
    memset(fresh, 0, elsize);
    return fresh;
}


void* aListIdx(void** Ap, int idx, int elsize)
/* Return a pointer to element 'idx' of the dynamic array '*Ap',      */
/* growing the array if 'idx' is at or beyond its current size.  All  */
/* elements added by the growth (old size .. idx) are zero-filled and */
/* '*Ap' is updated.  A negative or absurdly large 'idx' is asserted  */
/* against and yields NULL.                                           */
{   int oldsize;
    char* A;

    /* Negative indices used to slip through and produce a pointer    */
    /* before the array (or a bogus allocation size).                 */
    if (idx < 0 || idx > 1010001000) {   // What's a suitably ridiculously large limit?
        assert(false);
        return NULL;
    }
    if (*Ap) {
        A = (char*)*Ap - sizeof(int);
        oldsize = *(int*)A;
        if (idx < oldsize)
            return A + sizeof(int) + idx*elsize;    /* Already present. */
        int bytesNeeded = (idx+1) * elsize + sizeof(int);
        int bytesHave = snazzyMsize(A);
        if (bytesNeeded > bytesHave) {
            /* Grow by 1.5x steps until it fits. */
            int request = bytesHave + bytesHave/2;
            while (request < bytesNeeded)
                request += request/2;
            A = (char*)snazzyRealloc(A, request);
        }
        *(int*)A = idx + 1;
        /* Clear everything from the old end up to and including 'idx'. */
        memset(A + sizeof(int) + oldsize*elsize, 0, (idx-oldsize+1) * elsize);
        *Ap = A + sizeof(int);
    }
    else {
        /* Empty list: one zeroed block holding the count and idx+1 elements. */
        A = (char*)snazzyCalloc((idx+1)*elsize + sizeof(int), 1);
        *(int*)A = idx + 1;
        *Ap = A + sizeof(int);
    }
    return A + sizeof(int) + idx*elsize;
}


void* aListCopy(void* A, int elsize)
/* Duplicate a dynamic array: the int count and all the elements are  */
/* copied into a freshly allocated block.  A NULL (empty) list copies */
/* to NULL.                                                           */
{
    if (A == NULL)
        return NULL;

    const char* srcBlock = (const char*)A - sizeof(int);
    int bytes = ((int*)A)[-1] * elsize + sizeof(int);

    char* dupBlock = (char*)snazzyMalloc(bytes);
    memcpy(dupBlock, srcBlock, bytes);
    return dupBlock + sizeof(int);
}


void aListInsP(void** Ap, const void* p, int elsize)
/* Insert this pointer into a dynamic array of pointers.              */
/* This fn doesn't work with arrays of integers etc.                  */
/* If the ptr is already there, don't insert a 2nd copy.              */
/* 'elsize' must be the size of a pointer, because the array is       */
/* scanned as void*[].                                                */
{   int i, size;
    void** A;

    /* This was hard-coded to 4, which is only right where pointers   */
    /* are 4 bytes wide.                                              */
    assert(elsize == (int)sizeof(void*));

    A = (void**)*Ap;
    if (A != NULL) {
        size = ((int*)A)[-1];
        for (i=0; i < size; i++) {
            if (A[i] == p)
                return;            /* It's already there. */
        }
    }
    /* Not present (or the list is empty): append it. */
    *(const void**)aListNext(Ap, elsize) = p;
}


void* aListInsN(void** Ap, int i, int elsize)
/* Insert a zero-filled entry at position 'i' and return a pointer to */
/* it.  The array is first grown by one element, then everything from */
/* 'i' onwards is shifted one place towards the end.                  */
{
    aListNext(Ap, elsize);

    char* elems = (char*)*Ap;
    int newsize = ((int*)elems)[-1];
    assert(i < newsize);

    char* slot = elems + i*elsize;
    memmove(slot + elsize, slot, (newsize-i-1)*elsize);
    memset(slot, 0, elsize);
    return slot;
}


void aListDelN(void* Ap, int i, int elsize)
/* Delete element 'i' from this array. Shift everything up from the end */
/* to fill its place. If the last remaining element is deleted, then    */
/* the array pointer is set to NULL (i.e. empty-set).                   */
/* 'Ap' is the address of the array pointer.  An empty (NULL) array or  */
/* an out-of-range 'i' is silently ignored.                             */
{   void *A=*(void**)Ap;
    int size;

    if (A == NULL)
        return;                 /* Empty list: there's no count to read. */
    size = ((int*)A)[-1];
    if (i < 0 || i >= size)
        return;
    if (size == 1) {
        /* Removing the only element frees the whole block. */
        snazzyFree((char *)A - sizeof(int));
        *(void**)Ap = NULL;
    }
    else {
        memmove((char*)A + i*elsize, (char*)A + (i+1)*elsize, (size-i-1) * elsize);
        ((int*)A)[-1]--;
    }
}


void aListDelP(void* Ap, const void *p, int elsize)
/* Remove the pointer 'p' from an array of pointers ('Ap' is the      */
/* address of the array pointer).  Only the first matching entry is   */
/* removed.  If the array ends up with a count of zero it is freed    */
/* and the array pointer is set to NULL.  'elsize' is not used.       */
{   void **A = *(void***)Ap;

    if (A == NULL)
        return;

    int *countp = (int*)A - 1;          /* The count sits just before A[0]. */
    const int size = *countp;

    for (int k = 0; k < size; k++) {
        if (A[k] != p)
            continue;
        memmove(&A[k], &A[k+1], (size-k-1)*sizeof(void*));
        --*countp;
        break;
        /* To delete all entries instead, carry on scanning after     */
        /* stepping the size and the index back by one.               */
    }

    if (*countp == 0) {
        snazzyFree((char*)A - sizeof(int));
        *(void**)Ap = NULL;
    }
}


void aListConcat(void* Ap, void *B, int elsize)
/* Concatenate all of 'B' into 'A'.  (Tack it onto the end of it).    */
/* 'Ap' is the address of A's array pointer; it is updated because    */
/* the block may move.  If A is empty it becomes a copy of B; if B is */
/* empty nothing happens.  Appending a list to itself is supported.   */
{   int sizeA, sizeB;
    void* A;
    bool selfAppend;

    A = *(void**)Ap;
    if (A == NULL) {
        *(void**)Ap = aListCopy(B, elsize);
        return;
    }
    if (B == NULL)
        return;
    sizeA = ((int*)A)[-1];
    sizeB = ((int*)B)[-1];

    /* If B is the very same list as A, the realloc below may move    */
    /* the block and leave B dangling, so remember that now.          */
    selfAppend = (B == A);

    A = (char*)snazzyRealloc((char*)A - sizeof(int),
                    (sizeA+sizeB) * elsize + sizeof(int))
                     + sizeof(int);
    *(void**)Ap = A;
    if (selfAppend)
        B = A;                  /* Follow the block to its new home. */

    memcpy((char*)A + sizeA*elsize, B, sizeB*elsize);
    ((int*)A)[-1] = sizeA + sizeB;
}


void aListMerge(void **&A, void **B)
/* For dynamic arrays of pointers or ints, form the set union.               */
/* Merge 'B' into 'A'.  (Like ListConcat() but does not produce duplicates). */
/* 'A' is taken by reference and handed to ListInsP() for each element.      */
/* NOTE(review): 'each_aeli' and 'ListInsP' are macros defined elsewhere;    */
/* presumably 'each_aeli(b, B)' walks B placing each element in 'b', and     */
/* ListInsP() skips values already in A - confirm against list.h.            */
{
	void *b = NULL;		/* Receives each element of B in turn. */

	for (int each_aeli(b, B))
		ListInsP(A, b);
}


void aListFree(void* A)
/* Release a dynamic array.  The block actually starts at the int     */
/* count stored just before element 0.  A NULL (empty) list is a      */
/* no-op.                                                             */
{
    if (A != NULL) {
        snazzyFree((char*)A - sizeof(int));
    }
}


bool aListHasP(void* A0, const void* p)
/* Is the pointer 'p' one of the entries of 'A0[]'?  Returns 'yes' or */
/* 'no'.  Only meaningful for arrays of pointers; an empty (NULL)     */
/* array contains nothing.                                            */
{
    if (A0 == NULL)
        return no;

    void** entries = (void**)A0;
    const int size = ((int*)A0)[-1];
    int k = 0;

    while (k < size) {
        if (entries[k] == p)
            return yes;
        k++;
    }
    return no;
}


bool aListHasN(void* A0, int n)
/* Is the integer 'n' one of the entries of 'A0[]'?  Returns 'yes' or */
/* 'no'.  The array is read as an array of ints; an empty (NULL)      */
/* array contains nothing.                                            */
{
    if (A0 == NULL)
        return no;

    int* entries = (int*)A0;
    const int size = entries[-1];
    int k = 0;

    while (k < size) {
        if (entries[k] == n)
            return yes;
        k++;
    }
    return no;
}


int aListFindP(void* A0, void* p)
/* Index of the first entry of 'A0[]' equal to the pointer 'p', or -1 */
/* if there is none (an empty/NULL array included).                   */
{
    if (A0 == NULL)
        return -1;

    void** entries = (void**)A0;
    const int size = ((int*)A0)[-1];
    int k = 0;

    /* Stop at the first match or at the end, whichever comes first. */
    while (k < size && entries[k] != p)
        k++;
    return (k < size) ? k : -1;
}


int aListFindN(void* A0, int n)
/* Index of the first entry of 'A0[]' equal to the integer 'n', or -1 */
/* if there is none (an empty/NULL array included).  The array is     */
/* read as an array of ints.                                          */
{
    if (A0 == NULL)
        return -1;

    int* entries = (int*)A0;
    const int size = entries[-1];
    int k = 0;

    /* Stop at the first match or at the end, whichever comes first. */
    while (k < size && entries[k] != n)
        k++;
    return (k < size) ? k : -1;
}


void aListSetSize(void** Ap, int newsize, int elsize)
/* Set the size for this array.  It might be larger or smaller.       */
/* Growing goes through aListIdx(), which zero-fills the new entries. */
/* Shrinking only rewrites the stored count.  A new size of 0 frees   */
/* the array and sets '*Ap' to NULL.                                  */
{   void* A=*Ap;
    int oldsize = ListSize(A);

    if (newsize == oldsize)
        return;                         /* Nothing to do. */

    if (newsize == 0) {
        snazzyFree((char*)A - sizeof(int));
        *Ap = NULL;
        return;
    }

    if (newsize > oldsize) {
        aListIdx(Ap, newsize-1, elsize);
        return;
    }

    ((int*)A)[-1] = newsize;            /* Shrink in place. */
}


void aListFreeFree(void** A, bool are_dynarrays)
/* 'A' is an array of pointers: free every object it points to, then  */
/* the array itself.  The objects are released with aListFree() when  */
/* 'are_dynarrays' is set, otherwise with free().  They are visited   */
/* from the last entry down to the first.                             */
/* Destructors are _not_ called, for the simple reason that           */
/* constructors are not called when the arrays are created.           */
{   int remaining = ListSize(A);

    while (remaining > 0) {
        remaining--;
        if (!are_dynarrays)
            free(A[remaining]);
        else aListFree(A[remaining]);
    }
    ListFree(A);
}






/* MSORT: copied from msort.c in unix/db */
/* tco> Made it re-entrant. */


/* State for one msort() call.  Keeping it in a struct, rather than in */
/* globals, is what makes the sort re-entrant.                          */
struct Msorter {
    char* base;         /* The caller's array, addressed in bytes. */
    size_t size;        /* Size of one element in bytes. */
    int n;              /* Number of elements. */
    int* b1;            /* Element indices; holds the sorted order after merge(). */
    int* b2;            /* Scratch for merge(); later the inverse of b1. */
    int (*compar)(const void *, const void *);  /* Comparison callback; merge() tests whether it returns > 0. */
    
    void merge(int p, int q);   /* Sort b1[p..q-1]. */
    void sort();                /* Sort the whole array in place. */
};


void Msorter::merge(int p, int q)
/* Sort from p to q-1 inclusive */
{   int ne = q-p;
    int ne2 = (ne+1)/2;
    int r = p+ne2;
    int p0;
    int p1;
    int p2;
    int m1;
    int m2;
    
    assert(ne > 1);
    
    if (r-p > 1) merge(p, r);
    if (q-r > 1) merge(r, q);
    
    p0 = p;
    p1 = p;
    p2 = r;
    
    do {
        m1 = b1[p1];
        m2 = b1[p2];
        if (compar(&base[m1 * size], &base[m2 * size]) > 0) {
            b2[p0++] = m2;
            p2++;
        } else {
            b2[p0++] = m1;
            p1++;
        }
    } while (p1 < r && p2 < q);
    
    while (p1 < r) b2[p0++] = b1[p1++];
    while (p2 < q) b2[p0++] = b1[p2++];
    
    assert(p0 == q);
    
    for (p0 = p; p0 < q; p0++)
        b1[p0] = b2[p0];
}


void Msorter::sort()
/* Sort the 'n' elements at 'base' in place.  An array of element     */
/* indices is merge-sorted first, then the elements are swapped into  */
/* their final positions using one element-sized temporary buffer.    */
/* All three work buffers are released before returning.              */
{   size_t i;

    b1 = (int*)snazzyMalloc(n * sizeof(*b1));
    b2 = (int*)snazzyMalloc(n * sizeof(*b2));
    void* tmp = snazzyMalloc(size);

    /* Start from the identity order and sort the indices. */
    for (i=0; i < n; i++)
        b1[i] = i;
    merge(0, n);

    /* b1[k] is now the element that belongs at position k.  Build    */
    /* the inverse map: b2[e] is the position where element e goes.   */
    for (i=0; i < n; i++)
        b2[b1[i]] = i;

    /* Swap the elements into place, keeping both maps up to date. */
    for (i=0; i < n; i++) {
        size_t tb1 = b1[i];
        size_t tb2 = b2[i];

        if (tb1 == i) {
            assert(tb2 == i);
            continue;                   /* Already where it belongs. */
        }

        assert((size_t)b2[tb1] == i);
        assert((size_t)b1[tb2] == i);

        char* here = base + i * size;
        char* there = base + tb1 * size;
        memcpy(tmp, here, size);
        memmove(here, there, size);
        memcpy(there, tmp, size);

        /* What used to sit at 'i' now lives in slot tb1. */
        b1[tb2] = tb1;
        b2[tb1] = tb2;
    }

    snazzyFree(b1);
    snazzyFree(b2);
    snazzyFree(tmp);
}


void msort(void *base, size_t n, size_t size, int (*compar) (const void *, const void *))
/* Merge sort with a qsort()-style argument list: 'n' elements of     */
/* 'size' bytes each starting at 'base', ordered by 'compar'.  Arrays */
/* with fewer than two elements are left untouched.                   */
{
    if (n <= 1)
        return;

    /* Field order: base, size, n, b1, b2, compar.  sort() allocates  */
    /* b1 and b2 itself.                                              */
    Msorter sorter = { (char*)base, size, (int)n, NULL, NULL, compar };

    sorter.sort();
}


/*---------------------------------------------------*/

/* Don't release code with this in it!

void *operator new(size_t size)
{
	if(MainHeap == NULL)
		MallocSetHeap(0);

	return MainHeap->malloc(size);
}

void operator delete(void *mem)
{
	MainHeap->free(mem);
}

*/
