prosperon/source/engine/thirdparty/Chipmunk2D/src/cpHastySpace.c

// Copyright 2013 Howling Moon Software. All rights reserved.
// See http://chipmunk2d.net/legal.php for more information.

#include <stdlib.h>
#include <stdio.h>

//TODO: Move all the thread stuff to another file

//#include <sys/param.h >

#ifdef __APPLE__
#include <sys/sysctl.h>
#endif

#ifndef _WIN32
#include <pthread.h>
#elif defined(__MINGW32__)
#include <pthread.h>
#else
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif

#ifndef NOMINMAX
#define NOMINMAX
#endif

#include <process.h> // _beginthreadex
#include <windows.h>

#ifndef ETIMEDOUT
#define ETIMEDOUT 1
#endif

// Simple pthread implementation for Windows
// Made from scratch to avoid the LGPL licence from pthread-win32
// Indices into pthread_cond_t::events, plus the size of that array.
enum {
	SIGNAL = 0,     // event set by pthread_cond_signal()
	BROADCAST = 1,  // event set by pthread_cond_broadcast()
	MAX_EVENTS = 2
};

// A thread is represented by the Win32 handle returned from _beginthreadex().
typedef HANDLE pthread_t;

// Condition variable emulated with two Win32 events and a waiter counter.
typedef struct
{
	// Based on http://www.cs.wustl.edu/~schmidt/win32-cv-1.html since Windows has no condition variable until NT6
	UINT waiters_count;
	// Count of the number of waiters.

	CRITICAL_SECTION waiters_count_lock;
	// Serialize access to <waiters_count>.

	// events[SIGNAL] is created auto-reset and events[BROADCAST] manual-reset
	// by pthread_cond_init().
	HANDLE events[MAX_EVENTS];
} pthread_cond_t;

// A mutex is a plain Win32 critical section.
typedef CRITICAL_SECTION pthread_mutex_t;

typedef struct {} pthread_condattr_t; // Dummy: condition attributes are accepted but ignored.

// Tear down a condition variable created by pthread_cond_init(): close both
// event handles, then delete the waiter-count lock.
// Always returns 0; the results of the Win32 calls are not inspected.
int pthread_cond_destroy(pthread_cond_t* cv)
{
	HANDLE *handles = cv->events;

	CloseHandle(handles[BROADCAST]);
	CloseHandle(handles[SIGNAL]);
	DeleteCriticalSection(&cv->waiters_count_lock);

	return 0;
}

// Prepare <cv> for use: zero waiters, one auto-reset event for signal, one
// manual-reset event for broadcast, and the lock guarding the waiter count.
// <attr> is ignored. Always returns 0; CreateEvent() results are not checked.
int pthread_cond_init(pthread_cond_t* cv, const pthread_condattr_t* attr)
{
	(void)attr;

	// Nobody is waiting yet.
	cv->waiters_count = 0;

	// Arguments: default security, manual-reset flag, initial state, name.
	HANDLE signal_event = CreateEvent(NULL, FALSE, FALSE, NULL);   // auto-reset, non-signaled, unnamed
	cv->events[SIGNAL] = signal_event;

	HANDLE broadcast_event = CreateEvent(NULL, TRUE, FALSE, NULL); // manual-reset, non-signaled, unnamed
	cv->events[BROADCAST] = broadcast_event;

	InitializeCriticalSection(&cv->waiters_count_lock);

	return 0;
}

// Wake every thread currently blocked in pthread_cond_wait() on <cv>.
// The broadcast event is only set when at least one waiter is registered.
// Always returns 0.
int pthread_cond_broadcast(pthread_cond_t *cv)
{
	CRITICAL_SECTION *count_lock = &cv->waiters_count_lock;

	// Sample the waiter count under its lock.
	EnterCriticalSection(count_lock);
	UINT waiting = cv->waiters_count;
	LeaveCriticalSection(count_lock);

	if (waiting != 0) {
		SetEvent(cv->events[BROADCAST]);
	}

	return 0;
}

// Wake one thread blocked in pthread_cond_wait() on <cv>.
// The auto-reset signal event is only set when at least one waiter is
// registered. Always returns 0.
int pthread_cond_signal(pthread_cond_t* cv)
{
	CRITICAL_SECTION *count_lock = &cv->waiters_count_lock;

	// Sample the waiter count under its lock.
	EnterCriticalSection(count_lock);
	UINT waiting = cv->waiters_count;
	LeaveCriticalSection(count_lock);

	if (waiting != 0) {
		SetEvent(cv->events[SIGNAL]);
	}

	return 0;
}

// Block the calling thread on <cv> until pthread_cond_signal() or
// pthread_cond_broadcast() sets one of its events.
//
// <external_mutex> must be held by the caller; it is released while waiting
// and re-entered before returning.
// Returns ETIMEDOUT if the wait reports WAIT_TIMEOUT, otherwise 0. The wait
// is issued with INFINITE, so the timeout result is not expected in practice.
int pthread_cond_wait(pthread_cond_t* cv, pthread_mutex_t* external_mutex)
{
	// Register as a waiter before releasing the external mutex, so that a
	// signaller that takes the mutex afterwards sees a non-zero count.
	EnterCriticalSection(&cv->waiters_count_lock);
	cv->waiters_count++;
	LeaveCriticalSection(&cv->waiters_count_lock);

	// It's ok to release the <external_mutex> here since Win32
	// manual-reset events maintain state when used with
	// <SetEvent>.  This avoids the "lost wakeup" bug...
	LeaveCriticalSection(external_mutex);

	// Wait for either event to become signaled due to <pthread_cond_signal>
	// being called or <pthread_cond_broadcast> being called.
	int result = WaitForMultipleObjects(2, cv->events, FALSE, INFINITE);

	// Unregister, and note whether this thread is the last one released by a
	// broadcast (the result index identifies which event fired).
	EnterCriticalSection(&cv->waiters_count_lock);
	cv->waiters_count--;
	int last_waiter =
		result == WAIT_OBJECT_0 + BROADCAST
		&& cv->waiters_count == 0;
	LeaveCriticalSection(&cv->waiters_count_lock);

	// Some thread called <pthread_cond_broadcast>.
	if (last_waiter)
		// We're the last waiter to be notified or to stop waiting, so
		// reset the manual event.
		ResetEvent(cv->events[BROADCAST]);

	// Reacquire the <external_mutex>.
	EnterCriticalSection(external_mutex);

	return result == WAIT_TIMEOUT ? ETIMEDOUT : 0;
}

typedef struct {} pthread_mutexattr_t; // Placeholder: mutex attributes are accepted but ignored.

// Initialize <mutex> as a Win32 critical section. <attr> is unused.
// Always returns 0.
int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* attr)
{
	(void)attr;

	CRITICAL_SECTION *section = mutex;
	InitializeCriticalSection(section);

	return 0;
}

// Release the critical section backing <mutex>. Always returns 0.
int pthread_mutex_destroy(pthread_mutex_t* mutex)
{
	CRITICAL_SECTION *section = mutex;
	DeleteCriticalSection(section);

	return 0;
}

// Acquire <mutex> by entering its critical section. Always returns 0.
int pthread_mutex_lock(pthread_mutex_t* mutex)
{
	CRITICAL_SECTION *section = mutex;
	EnterCriticalSection(section);

	return 0;
}

// Release <mutex> by leaving its critical section. Always returns 0.
int pthread_mutex_unlock(pthread_mutex_t* mutex)
{
	CRITICAL_SECTION *section = mutex;
	LeaveCriticalSection(section);

	return 0;
}

// Placeholder: thread attributes are accepted by pthread_create() but ignored.
typedef struct {} pthread_attr_t;

// Heap-allocated record that carries a pthread-style entry point and its
// argument to ThreadProc(), which frees it once the entry point returns.
typedef struct
{
	void *(*start_routine) (void *); // pthread-style entry point
	void* arg;                       // argument passed to start_routine
} pthread_internal_thread;

// Trampoline given to _beginthreadex(): unpacks the pthread_internal_thread
// record in <userdata>, runs the pthread-style entry point, then frees the
// record. The entry point's return value is discarded; always returns 0.
unsigned int __stdcall ThreadProc(void* userdata)
{
	pthread_internal_thread* start = (pthread_internal_thread*) userdata;
	void *(*routine)(void *) = start->start_routine;
	void *routine_arg = start->arg;

	routine(routine_arg);

	// The record was allocated by pthread_create() and is owned by this thread.
	free(start);

	return 0;
}

// Start a new thread running start_routine(arg) and store its handle in
// <*thread>. <attr> is ignored.
//
// Returns 0 on success and 1 on failure (out of memory, or _beginthreadex()
// could not start the thread). On success the trampoline record is owned by
// the new thread, which frees it in ThreadProc(); on failure it is freed here.
int pthread_create(pthread_t* thread, const pthread_attr_t* attr, void *(*start_routine) (void *), void *arg)
{
	(void)attr;

	pthread_internal_thread* ud = (pthread_internal_thread*) malloc(sizeof(*ud));
	if (!ud)
		return 1;

	ud->start_routine = start_routine;
	ud->arg = arg;

	// _beginthreadex() returns 0 when the thread could not be created.
	*thread = (HANDLE) (_beginthreadex(NULL, 0, &ThreadProc, ud, 0, NULL));
	if (!*thread) {
		// ThreadProc() will never run, so nobody else would release the record.
		free(ud);
		return 1;
	}

	return 0;
}

// Block until <thread> has finished, then close its handle.
// <value_ptr> is ignored: the thread's return value is not made available.
// Always returns 0.
int pthread_join(pthread_t thread, void **value_ptr)
{
	(void)value_ptr;

	HANDLE handle = thread;
	WaitForSingleObject(handle, INFINITE);
	CloseHandle(handle);

	return 0;
}

#endif

#include "chipmunk/chipmunk_private.h"
#include "chipmunk/cpHastySpace.h"


//MARK: ARM NEON Solver

#if __ARM_NEON__
#include <arm_neon.h>

// Tested and known to work fine with Clang 3.0 and GCC 4.2
// Doesn't work with Clang 1.6, and I have no idea why.
#if defined(__clang_major__) && __clang_major__ < 3
	#error Compiler not supported.
#endif

// Precision-neutral aliases: cpFloat_t is the scalar type, cpFloatx2_t a
// two-lane vector of it, and each v* macro names the NEON intrinsic for the
// selected precision so the solver below is written only once.
#if CP_USE_DOUBLES
	#if !__arm64
		#error Cannot use CP_USE_DOUBLES on 32 bit ARM.
	#endif
	
	// Double precision: two 64-bit lanes.
	typedef float64_t cpFloat_t;
	typedef float64x2_t cpFloatx2_t;
	#define vld vld1q_f64
	#define vdup_n vdupq_n_f64
	#define vst vst1q_f64
	#define vst_lane vst1q_lane_f64
	#define vadd vaddq_f64
	#define vsub vsubq_f64
	#define vpadd vpaddq_f64
	#define vmul vmulq_f64
	#define vmul_n vmulq_n_f64
	#define vneg vnegq_f64
	#define vget_lane vgetq_lane_f64
	#define vset_lane vsetq_lane_f64
	#define vmin vminq_f64
	#define vmax vmaxq_f64
	// Swap the two lanes via a compiler builtin shuffle.
	#define vrev(__a) __builtin_shufflevector(__a, __a, 1, 0)
#else
	// Single precision: two 32-bit lanes.
	typedef float32_t cpFloat_t;
	typedef float32x2_t cpFloatx2_t;
	#define vld vld1_f32
	#define vdup_n vdup_n_f32
	#define vst vst1_f32
	#define vst_lane vst1_lane_f32
	#define vadd vadd_f32
	#define vsub vsub_f32
	#define vpadd vpadd_f32
	#define vmul vmul_f32
	#define vmul_n vmul_n_f32
	#define vneg vneg_f32
	#define vget_lane vget_lane_f32
	#define vset_lane vset_lane_f32
	#define vmin vmin_f32
	#define vmax vmax_f32
	// Swap the two lanes.
	#define vrev vrev64_f32
#endif

// TODO could probably do better here, maybe using vcreate?
// especially for the constants
// Maybe use the {} notation for GCC/Clang?
// Build a two-lane vector with <x> in lane 0 and <y> in lane 1.
static inline cpFloatx2_t
vmake(cpFloat_t x, cpFloat_t y)
{
	// Brace initialization of a vector type might not be super compatible,
	// but all the NEON headers use it...
	cpFloatx2_t lanes = {x, y};
	return lanes;
}

// NEON version of the per-arbiter impulse step used by Solver().
//
// For every contact of <arb> it computes the bias impulse, the normal impulse
// and the friction impulse, clamps the accumulated values, and writes the
// updated velocities back to both bodies and the accumulators back to the
// contact. Pairs of related scalars are packed into the two lanes of one
// vector so they are processed together.
static void
cpArbiterApplyImpulse_NEON(cpArbiter *arb)
{
	cpBody *a = arb->body_a;
	cpBody *b = arb->body_b;
	// NOTE(review): the vld casts assume each cpVect is two consecutive cpFloat_t values — confirm against the cpVect definition.
	cpFloatx2_t surface_vr = vld((cpFloat_t *)&arb->surface_vr);
	cpFloatx2_t n = vld((cpFloat_t *)&arb->n);
	cpFloat_t friction = arb->u;
	
	int numContacts = arb->count;
	struct cpContact *contacts = arb->contacts;
	for(int i=0; i<numContacts; i++){
		struct cpContact *con = contacts + i;
		cpFloatx2_t r1 = vld((cpFloat_t *)&con->r1);
		cpFloatx2_t r2 = vld((cpFloat_t *)&con->r2);
		
		// rev(v)*(-1, 1) == (-v.y, v.x): the perpendicular of v.
		cpFloatx2_t perp = vmake(-1.0, 1.0);
		cpFloatx2_t r1p = vmul(vrev(r1), perp);
		cpFloatx2_t r2p = vmul(vrev(r2), perp);
		
		// Bias velocities; wBias packs (a->w_bias, b->w_bias).
		cpFloatx2_t vBias_a = vld((cpFloat_t *)&a->v_bias);
		cpFloatx2_t vBias_b = vld((cpFloat_t *)&b->v_bias);
		cpFloatx2_t wBias = vmake(a->w_bias, b->w_bias);
		
		// Relative bias velocity at the contact: (vb_b + r2p*wb_b) - (vb_a + r1p*wb_a).
		cpFloatx2_t vb1 = vadd(vBias_a, vmul_n(r1p, vget_lane(wBias, 0)));
		cpFloatx2_t vb2 = vadd(vBias_b, vmul_n(r2p, vget_lane(wBias, 1)));
		cpFloatx2_t vbr = vsub(vb2, vb1);
		
		// Same computation for the real velocities; w packs (a->w, b->w).
		cpFloatx2_t v_a = vld((cpFloat_t *)&a->v);
		cpFloatx2_t v_b = vld((cpFloat_t *)&b->v);
		cpFloatx2_t w = vmake(a->w, b->w);
		cpFloatx2_t v1 = vadd(v_a, vmul_n(r1p, vget_lane(w, 0)));
		cpFloatx2_t v2 = vadd(v_b, vmul_n(r2p, vget_lane(w, 1)));
		cpFloatx2_t vr = vsub(v2, v1);
		
		// Pairwise add of the products: lane 0 = dot(vbr, n), lane 1 = dot(vr, n).
		cpFloatx2_t vbn_vrn = vpadd(vmul(vbr, n), vmul(vr, n));
		
		// Lane 0 handles the bias impulse, lane 1 the normal impulse.
		// New accumulated values are clamped to be non-negative; jApply is
		// the change relative to the previous accumulators.
		cpFloatx2_t v_offset = vmake(con->bias, -con->bounce);
		cpFloatx2_t jOld = vmake(con->jBias, con->jnAcc);
		cpFloatx2_t jbn_jn = vmul_n(vsub(v_offset, vbn_vrn), con->nMass);
		jbn_jn = vmax(vadd(jOld, jbn_jn), vdup_n(0.0));
		cpFloatx2_t jApply = vsub(jbn_jn, jOld);
		
		// t is the perpendicular of n; vrt holds dot(vr + surface_vr, t) in both lanes.
		cpFloatx2_t t = vmul(vrev(n), perp);
		cpFloatx2_t vrt_tmp = vmul(vadd(vr, surface_vr), t);
		cpFloatx2_t vrt = vpadd(vrt_tmp, vrt_tmp);
		
		// Friction impulse. After the lane swap, lane 0 of jtMax is
		// friction * (accumulated normal impulse); the accumulated tangent
		// impulse is clamped to [-jtMax, jtMax]. Only lane 0 of jt is used below.
		cpFloatx2_t jtOld = {}; jtOld = vset_lane(con->jtAcc, jtOld, 0);
		cpFloatx2_t jtMax = vrev(vmul_n(jbn_jn, friction));
		cpFloatx2_t jt = vmul_n(vrt, -con->tMass);
		jt = vmax(vneg(jtMax), vmin(vadd(jtOld, jt), jtMax));
		cpFloatx2_t jtApply = vsub(jt, jtOld);
		
		// i_inv packs (-a->i_inv, b->i_inv): body a receives the opposite impulse.
		// rev(j)*(1, -1) == (j.y, -j.x), so dot(r, that) is the 2D cross product of r and j.
		cpFloatx2_t i_inv = vmake(-a->i_inv, b->i_inv);
		cpFloatx2_t nperp = vmake(1.0, -1.0);
		
		// Apply the bias impulse (along n) to the bias velocities.
		cpFloatx2_t jBias = vmul_n(n, vget_lane(jApply, 0));
		cpFloatx2_t jBiasCross = vmul(vrev(jBias), nperp);
		cpFloatx2_t biasCrosses = vpadd(vmul(r1, jBiasCross), vmul(r2, jBiasCross));
		wBias = vadd(wBias, vmul(i_inv, biasCrosses));
		
		vBias_a = vsub(vBias_a, vmul_n(jBias, a->m_inv));
		vBias_b = vadd(vBias_b, vmul_n(jBias, b->m_inv));
		
		// Apply the combined normal + tangent impulse to the real velocities.
		cpFloatx2_t j = vadd(vmul_n(n, vget_lane(jApply, 1)), vmul_n(t, vget_lane(jtApply, 0)));
		cpFloatx2_t jCross = vmul(vrev(j), nperp);
		cpFloatx2_t crosses = vpadd(vmul(r1, jCross), vmul(r2, jCross));
		w = vadd(w, vmul(i_inv, crosses));
		
		v_a = vsub(v_a, vmul_n(j, a->m_inv));
		v_b = vadd(v_b, vmul_n(j, b->m_inv));
		
		// Write everything back so the next contact sees the updated bodies.
		// TODO would moving these earlier help pipeline them better?
		vst((cpFloat_t *)&a->v_bias, vBias_a);
		vst((cpFloat_t *)&b->v_bias, vBias_b);
		vst_lane((cpFloat_t *)&a->w_bias, wBias, 0);
		vst_lane((cpFloat_t *)&b->w_bias, wBias, 1);
		
		vst((cpFloat_t *)&a->v, v_a);
		vst((cpFloat_t *)&b->v, v_b);
		vst_lane((cpFloat_t *)&a->w, w, 0);
		vst_lane((cpFloat_t *)&b->w, w, 1);
		
		// Store the new accumulated impulses on the contact.
		vst_lane((cpFloat_t *)&con->jBias, jbn_jn, 0);
		vst_lane((cpFloat_t *)&con->jnAcc, jbn_jn, 1);
		vst_lane((cpFloat_t *)&con->jtAcc, jt, 0);
	}
}

#endif

//MARK: PThreads

// Right now using more than 2 threads probably won't help your performance any.
// If you are using a ridiculous number of iterations it could help though.
// Upper bound on the thread count, main thread included.
// cpHastySpaceSetThreads() clamps the requested count to this value.
#define MAX_THREADS 2

// Per-worker record handed to WorkerThreadLoop() when the thread is created.
struct ThreadContext {
	pthread_t thread;          // handle used later by pthread_join()
	cpHastySpace *space;       // owning hasty space
	unsigned long thread_num;  // worker index passed to the work function; spawned workers start at 1
};

// Signature of a job that every thread runs in parallel: <worker> is the
// calling thread's index (0 for the main thread) and <worker_count> the total
// number of participating threads.
typedef	void (*cpHastySpaceWorkFunction)(cpSpace *space, unsigned long worker, unsigned long worker_count);

// A cpSpace extended with the state needed to run the solver on several threads.
struct cpHastySpace {
	// Base space. Kept as the first member because the code casts freely
	// between cpSpace * and cpHastySpace *.
	cpSpace space;
	
	// Total number of threads taking part in a job (including the main thread)
	unsigned long num_threads;
	
	// Number of spawned workers that have not yet reported back.
	// RunWorkers() sets it to num_threads - 1 (the main thread is not counted)
	// and each worker decrements it before going back to waiting.
	unsigned long num_working;
	
	// cpHastySpaceStep() only uses the worker threads when the number of
	// arbiters plus constraints in the step exceeds this value.
	unsigned long constraint_count_threshold;
	
	// mutex guards the hand-off; cond_work wakes the workers, cond_resume
	// wakes the main thread once num_working reaches zero.
	pthread_mutex_t mutex;
	pthread_cond_t cond_work, cond_resume;
	
	// Work function to invoke. NULL tells a woken worker to exit.
	cpHastySpaceWorkFunction work;
	
	// One context per spawned worker; the main thread needs none.
	struct ThreadContext workers[MAX_THREADS - 1];
};

static void *
WorkerThreadLoop(struct ThreadContext *context)
{
	cpHastySpace *hasty = context->space;
	
	unsigned long thread = context->thread_num;
	unsigned long num_threads = hasty->num_threads;
	
	for(;;){
		pthread_mutex_lock(&hasty->mutex); {
			if(--hasty->num_working == 0){
				pthread_cond_signal(&hasty->cond_resume);
			}
			
			pthread_cond_wait(&hasty->cond_work, &hasty->mutex);
		} pthread_mutex_unlock(&hasty->mutex);
		
		cpHastySpaceWorkFunction func = hasty->work;
		if(func){
			hasty->work(&hasty->space, thread, num_threads);
		} else {
			break;
		}
	}
	
	return NULL;
}

// Run <func> on every thread of <hasty> and return once all of them are done.
//
// The calling thread always takes part as worker 0. When worker threads
// exist they are woken through cond_work and the caller then waits on
// cond_resume until num_working drops to zero. The work pointer is cleared
// again before returning.
static void
RunWorkers(cpHastySpace *hasty, cpHastySpaceWorkFunction func)
{
	hasty->num_working = hasty->num_threads - 1;
	hasty->work = func;
	
	if(hasty->num_working > 0){
		// Wake the sleeping workers.
		pthread_mutex_lock(&hasty->mutex); {
			pthread_cond_broadcast(&hasty->cond_work);
		} pthread_mutex_unlock(&hasty->mutex);
		
		// Do the main thread's share.
		func((cpSpace *)hasty, 0, hasty->num_threads);
			
		pthread_mutex_lock(&hasty->mutex); {
			// Re-check the predicate after every wakeup: condition waits may
			// return spuriously, and continuing early would let the caller
			// proceed while workers are still solving.
			while(hasty->num_working > 0){
				pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
			}
		} pthread_mutex_unlock(&hasty->mutex);
	} else {
		// No worker threads: run the whole job inline.
		func((cpSpace *)hasty, 0, hasty->num_threads);
	}
	
	hasty->work = NULL;
}

// Work function for the impulse solver.
//
// Each participating thread runs ceil(space->iterations / worker_count)
// passes, and every pass applies impulses to all arbiters and then to all
// constraints of <space>. <worker> is unused.
static void
Solver(cpSpace *space, unsigned long worker, unsigned long worker_count)
{
	cpArray *arbiterList = space->arbiters;
	cpArray *constraintList = space->constraints;
	cpFloat step = space->curr_dt;
	
	// Split the iteration budget between the threads, rounding up.
	unsigned long passes = (space->iterations + worker_count - 1)/worker_count;
	
	for(unsigned long pass=0; pass<passes; pass++){
		for(int a=0; a<arbiterList->num; a++){
			cpArbiter *arbiter = (cpArbiter *)arbiterList->arr[a];
			#ifdef __ARM_NEON__
				cpArbiterApplyImpulse_NEON(arbiter);
			#else
				cpArbiterApplyImpulse(arbiter);
			#endif
		}
		
		for(int c=0; c<constraintList->num; c++){
			cpConstraint *joint = (cpConstraint *)constraintList->arr[c];
			joint->klass->applyImpulse(joint, step);
		}
	}
}

//MARK: Thread Management Functions

// Ask every spawned worker to exit and wait for each of them to finish.
// Publishing a NULL work function and broadcasting cond_work makes the
// workers leave their loop; they are then joined one by one.
static void
HaltThreads(cpHastySpace *hasty)
{
	pthread_mutex_lock(&hasty->mutex);
	hasty->work = NULL; // NULL work function means break and exit
	pthread_cond_broadcast(&hasty->cond_work);
	pthread_mutex_unlock(&hasty->mutex);
	
	// The main thread is counted in num_threads but has no worker slot.
	unsigned long spawned = hasty->num_threads - 1;
	for(unsigned long slot=0; slot<spawned; slot++){
		struct ThreadContext *ctx = &hasty->workers[slot];
		pthread_join(ctx->thread, NULL);
	}
}

// Set how many threads (main thread included) the hasty space solves with.
//
// Existing workers are stopped first. Passing 0 asks for an automatic choice:
// the CPU count on Apple platforms, otherwise 1. The result is clamped to
// MAX_THREADS. The call returns once every new worker is parked waiting for
// work. If a worker cannot be created, the space falls back to the threads
// that did start.
void
cpHastySpaceSetThreads(cpSpace *space, unsigned long threads)
{
#if TARGET_IPHONE_SIMULATOR == 1
	// Individual values appear to be written non-atomically when compiled as debug for the simulator.
	// No idea why, so threads are disabled.
	threads = 1;
#endif
	
	cpHastySpace *hasty = (cpHastySpace *)space;
	HaltThreads(hasty);
	
#ifdef __APPLE__
	if(threads == 0){
		// "hw.ncpu" is an int; read it into one instead of an unsigned long,
		// and only trust the value when the call succeeded.
		int ncpu = 0;
		size_t size = sizeof(ncpu);
		if(sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0) == 0 && ncpu > 0){
			threads = (unsigned long)ncpu;
		}
	}
#endif
	
	// Never continue with zero threads: num_threads - 1 below would wrap
	// around and the worker loop would run past the end of workers[].
	if(threads == 0) threads = 1;
	
	hasty->num_threads = (threads < MAX_THREADS ? threads : MAX_THREADS);
	hasty->num_working = hasty->num_threads - 1;
	
	// Create the worker threads and wait for them to signal ready.
	if(hasty->num_working > 0){
		pthread_mutex_lock(&hasty->mutex);
		
		unsigned long wanted = hasty->num_threads - 1;
		unsigned long created = 0;
		while(created < wanted){
			struct ThreadContext *ctx = &hasty->workers[created];
			ctx->space = hasty;
			ctx->thread_num = created + 1;
			
			if(pthread_create(&ctx->thread, NULL, (void*(*)(void*))WorkerThreadLoop, ctx) != 0) break;
			created++;
		}
		
		if(created < wanted){
			// Shrink to what actually started, so that nothing waits for or
			// joins a thread that does not exist. The mutex is still held, so
			// no worker has decremented num_working yet. With MAX_THREADS == 2
			// this only happens when no worker exists at all.
			hasty->num_threads = created + 1;
			hasty->num_working = created;
		}
		
		// Each worker decrements num_working before it parks; loop on the
		// predicate so a spurious wakeup cannot end the wait early.
		while(hasty->num_working > 0){
			pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
		}
		pthread_mutex_unlock(&hasty->mutex);
	}
}

// Return the number of threads (main thread included) that <space>, which
// must be a hasty space, is currently configured to use.
unsigned long
cpHastySpaceGetThreads(cpSpace *space)
{
	cpHastySpace *hasty = (cpHastySpace *)space;
	return hasty->num_threads;
}

//MARK: Overridden cpSpace Functions.

cpSpace *
cpHastySpaceNew(void)
{
	cpHastySpace *hasty = (cpHastySpace *)cpcalloc(1, sizeof(cpHastySpace));
	cpSpaceInit((cpSpace *)hasty);
	
	pthread_mutex_init(&hasty->mutex, NULL);
	pthread_cond_init(&hasty->cond_work, NULL);
	pthread_cond_init(&hasty->cond_resume, NULL);
	
	// TODO magic number, should test this more thoroughly.
	hasty->constraint_count_threshold = 50;
	
	// Default to 1 thread for determinism.
	hasty->num_threads = 1;
	cpHastySpaceSetThreads((cpSpace *)hasty, 1);

	return (cpSpace *)hasty;
}

// Destroy a space created by cpHastySpaceNew(): stop the worker threads,
// release the synchronization primitives, then free the underlying cpSpace.
void
cpHastySpaceFree(cpSpace *space)
{
	cpHastySpace *self = (cpHastySpace *)space;
	
	// Workers must be gone before the primitives they use are destroyed.
	HaltThreads(self);
	
	pthread_mutex_destroy(&self->mutex);
	pthread_cond_destroy(&self->cond_work);
	pthread_cond_destroy(&self->cond_resume);
	
	cpSpaceFree(space);
}

// Advance <space> (which must be a hasty space) by <dt>.
//
// The step integrates positions, finds collisions, rebuilds the contact
// graph, pre-steps arbiters and constraints, integrates velocities, applies
// cached impulses, runs the impulse solver and finally invokes the post-solve
// callbacks. The solver runs through RunWorkers() only when the number of
// arbiters plus constraints exceeds constraint_count_threshold; otherwise it
// runs inline on the calling thread.
void
cpHastySpaceStep(cpSpace *space, cpFloat dt)
{
	// don't step if the timestep is 0!
	if(dt == 0.0f) return;
	
	space->stamp++;
	
	// Remember the previous timestep; it scales the cached impulses below.
	cpFloat prev_dt = space->curr_dt;
	space->curr_dt = dt;
		
	cpArray *bodies = space->dynamicBodies;
	cpArray *constraints = space->constraints;
	cpArray *arbiters = space->arbiters;
	
	// Reset and empty the arbiter list.
	for(int i=0; i<arbiters->num; i++){
		cpArbiter *arb = (cpArbiter *)arbiters->arr[i];
		arb->state = CP_ARBITER_STATE_NORMAL;
		
		// If both bodies are awake, unthread the arbiter from the contact graph.
		if(!cpBodyIsSleeping(arb->body_a) && !cpBodyIsSleeping(arb->body_b)){
			cpArbiterUnthread(arb);
		}
	}
	arbiters->num = 0;
	
	cpSpaceLock(space); {
		// Integrate positions
		for(int i=0; i<bodies->num; i++){
			cpBody *body = (cpBody *)bodies->arr[i];
			body->position_func(body, dt);
		}
		
		// Find colliding pairs.
		cpSpacePushFreshContactBuffer(space);
		cpSpatialIndexEach(space->dynamicShapes, (cpSpatialIndexIteratorFunc)cpShapeUpdateFunc, NULL);
		cpSpatialIndexReindexQuery(space->dynamicShapes, (cpSpatialIndexQueryFunc)cpSpaceCollideShapes, space);
	} cpSpaceUnlock(space, cpFalse); // NOTE(review): the meaning of the second argument is defined outside this file — confirm.
	
	// Rebuild the contact graph (and detect sleeping components if sleeping is enabled)
	cpSpaceProcessComponents(space, dt);
	
	cpSpaceLock(space); {
		// Clear out old cached arbiters and call separate callbacks
		cpHashSetFilter(space->cachedArbiters, (cpHashSetFilterFunc)cpSpaceArbiterSetFilter, space);

		// Prestep the arbiters and constraints.
		cpFloat slop = space->collisionSlop;
		cpFloat biasCoef = 1.0f - cpfpow(space->collisionBias, dt);
		for(int i=0; i<arbiters->num; i++){
			cpArbiterPreStep((cpArbiter *)arbiters->arr[i], dt, slop, biasCoef);
		}

		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			
			// Optional user callback runs before the constraint's own pre-step.
			cpConstraintPreSolveFunc preSolve = constraint->preSolve;
			if(preSolve) preSolve(constraint, space);
			
			constraint->klass->preStep(constraint, dt);
		}
	
		// Integrate velocities.
		cpFloat damping = cpfpow(space->damping, dt);
		cpVect gravity = space->gravity;
		for(int i=0; i<bodies->num; i++){
			cpBody *body = (cpBody *)bodies->arr[i];
			body->velocity_func(body, gravity, damping, dt);
		}
		
		// Apply cached impulses
		// The coefficient is dt/prev_dt, or 0 when there was no previous step.
		cpFloat dt_coef = (prev_dt == 0.0f ? 0.0f : dt/prev_dt);
		for(int i=0; i<arbiters->num; i++){
			cpArbiterApplyCachedImpulse((cpArbiter *)arbiters->arr[i], dt_coef);
		}
		
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			constraint->klass->applyCachedImpulse(constraint, dt_coef);
		}
		
		// Run the impulse solver.
		// Small workloads are solved inline as a single worker; larger ones
		// are handed to RunWorkers().
		cpHastySpace *hasty = (cpHastySpace *)space;
		if((unsigned long)(arbiters->num + constraints->num) > hasty->constraint_count_threshold){
			RunWorkers(hasty, Solver);
		} else {
			Solver(space, 0, 1);
		}
		
		// Run the constraint post-solve callbacks
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			
			cpConstraintPostSolveFunc postSolve = constraint->postSolve;
			if(postSolve) postSolve(constraint, space);
		}
		
		// run the post-solve callbacks
		for(int i=0; i<arbiters->num; i++){
			cpArbiter *arb = (cpArbiter *) arbiters->arr[i];
			
			cpCollisionHandler *handler = arb->handler;
			handler->postSolveFunc(arb, space, handler->userData);
		}
	} cpSpaceUnlock(space, cpTrue);
}
Organized files 2022-01-19 16:43:21 -06:00			`// Copyright 2013 Howling Moon Software. All rights reserved.`
			`// See http://chipmunk2d.net/legal.php for more information.`

			`#include <stdlib.h>`
			`#include <stdio.h>`

			`//TODO: Move all the thread stuff to another file`

			`//#include <sys/param.h >`

			`#ifdef __APPLE__`
			`#include <sys/sysctl.h>`
			`#endif`

			`#ifndef _WIN32`
			`#include <pthread.h>`
			`#elif defined(__MINGW32__)`
			`#include <pthread.h>`
			`#else`
			`#ifndef WIN32_LEAN_AND_MEAN`
			`#define WIN32_LEAN_AND_MEAN`
			`#endif`

			`#ifndef NOMINMAX`
			`#define NOMINMAX`
			`#endif`

			`#include <process.h> // _beginthreadex`
			`#include <windows.h>`

			`#ifndef ETIMEDOUT`
			`#define ETIMEDOUT 1`
			`#endif`

			`// Simple pthread implementation for Windows`
			`// Made from scratch to avoid the LGPL licence from pthread-win32`
			`enum {`
			`SIGNAL = 0,`
			`BROADCAST = 1,`
			`MAX_EVENTS = 2`
			`};`

			`typedef HANDLE pthread_t;`
			`typedef struct`
			`{`
			`// Based on http://www.cs.wustl.edu/~schmidt/win32-cv-1.html since Windows has no condition variable until NT6`
			`UINT waiters_count;`
			`// Count of the number of waiters.`

			`CRITICAL_SECTION waiters_count_lock;`
			`// Serialize access to <waiters_count_>.`

			`HANDLE events[MAX_EVENTS];`
			`} pthread_cond_t;`
			`typedef CRITICAL_SECTION pthread_mutex_t;`

			`typedef struct {} pthread_condattr_t; // Dummy;`

			`int pthread_cond_destroy(pthread_cond_t* cv)`
			`{`
			`CloseHandle(cv->events[BROADCAST]);`
			`CloseHandle(cv->events[SIGNAL]);`

			`DeleteCriticalSection(&cv->waiters_count_lock);`

			`return 0;`
			`}`

			`int pthread_cond_init(pthread_cond_t* cv, const pthread_condattr_t* attr)`
			`{`
			`// Initialize the count to 0.`
			`cv->waiters_count = 0;`

			`// Create an auto-reset event.`
			`cv->events[SIGNAL] = CreateEvent(NULL, // no security`
			`FALSE, // auto-reset event`
			`FALSE, // non-signaled initially`
			`NULL); // unnamed`

			`// Create a manual-reset event.`
			`cv->events[BROADCAST] = CreateEvent(NULL, // no security`
			`TRUE, // manual-reset`
			`FALSE, // non-signaled initially`
			`NULL); // unnamed`

			`InitializeCriticalSection(&cv->waiters_count_lock);`

			`return 0;`
			`}`

			`int pthread_cond_broadcast(pthread_cond_t *cv)`
			`{`
			`// Avoid race conditions.`
			`EnterCriticalSection(&cv->waiters_count_lock);`
			`int have_waiters = cv->waiters_count > 0;`
			`LeaveCriticalSection(&cv->waiters_count_lock);`

			`if (have_waiters)`
			`SetEvent(cv->events[BROADCAST]);`

			`return 0;`
			`}`

			`int pthread_cond_signal(pthread_cond_t* cv)`
			`{`
			`// Avoid race conditions.`
			`EnterCriticalSection(&cv->waiters_count_lock);`
			`int have_waiters = cv->waiters_count > 0;`
			`LeaveCriticalSection(&cv->waiters_count_lock);`

			`if (have_waiters)`
			`SetEvent(cv->events[SIGNAL]);`

			`return 0;`
			`}`

			`int pthread_cond_wait(pthread_cond_t* cv, pthread_mutex_t* external_mutex)`
			`{`
			`// Avoid race conditions.`
			`EnterCriticalSection(&cv->waiters_count_lock);`
			`cv->waiters_count++;`
			`LeaveCriticalSection(&cv->waiters_count_lock);`

			`// It's ok to release the <external_mutex> here since Win32`
			`// manual-reset events maintain state when used with`
			`// <SetEvent>. This avoids the "lost wakeup" bug...`
			`LeaveCriticalSection(external_mutex);`

			`// Wait for either event to become signaled due to <pthread_cond_signal>`
			`// being called or <pthread_cond_broadcast> being called.`
			`int result = WaitForMultipleObjects(2, cv->events, FALSE, INFINITE);`

			`EnterCriticalSection(&cv->waiters_count_lock);`
			`cv->waiters_count--;`
			`int last_waiter =`
			`result == WAIT_OBJECT_0 + BROADCAST`
			`&& cv->waiters_count == 0;`
			`LeaveCriticalSection(&cv->waiters_count_lock);`

			`// Some thread called <pthread_cond_broadcast>.`
			`if (last_waiter)`
			`// We're the last waiter to be notified or to stop waiting, so`
			`// reset the manual event.`
			`ResetEvent(cv->events[BROADCAST]);`

			`// Reacquire the <external_mutex>.`
			`EnterCriticalSection(external_mutex);`

			`return result == WAIT_TIMEOUT ? ETIMEDOUT : 0;`
			`}`

			`typedef struct {} pthread_mutexattr_t; //< Dummy`

			`int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* attr)`
			`{`
			`InitializeCriticalSection(mutex);`
			`return 0;`
			`}`

			`int pthread_mutex_destroy(pthread_mutex_t* mutex)`
			`{`
			`DeleteCriticalSection(mutex);`
			`return 0;`
			`}`

			`int pthread_mutex_lock(pthread_mutex_t* mutex)`
			`{`
			`EnterCriticalSection(mutex);`
			`return 0;`
			`}`

			`int pthread_mutex_unlock(pthread_mutex_t* mutex)`
			`{`
			`LeaveCriticalSection(mutex);`
			`return 0;`
			`}`

			`typedef struct {} pthread_attr_t;`

			`typedef struct`
			`{`
			`void (start_routine) (void *);`
			`void* arg;`
			`} pthread_internal_thread;`

			`unsigned int __stdcall ThreadProc(void* userdata)`
			`{`
			`pthread_internal_thread* ud = (pthread_internal_thread*) userdata;`
			`ud->start_routine(ud->arg);`

			`free(ud);`

			`return 0;`
			`}`

			`int pthread_create(pthread_t* thread, const pthread_attr_t* attr, void (start_routine) (void ), void arg)`
			`{`
			`pthread_internal_thread* ud = (pthread_internal_thread*) malloc(sizeof(pthread_internal_thread));`
			`ud->start_routine = start_routine;`
			`ud->arg = arg;`

			`*thread = (HANDLE) (_beginthreadex(NULL, 0, &ThreadProc, ud, 0, NULL));`
			`if (!*thread)`
			`return 1;`

			`return 0;`
			`}`

			`int pthread_join(pthread_t thread, void **value_ptr)`
			`{`
			`WaitForSingleObject(thread, INFINITE);`
			`CloseHandle(thread);`

			`return 0;`
			`}`

			`#endif`

			`#include "chipmunk/chipmunk_private.h"`
			`#include "chipmunk/cpHastySpace.h"`


			`//MARK: ARM NEON Solver`

			`#if __ARM_NEON__`
			`#include <arm_neon.h>`

			`// Tested and known to work fine with Clang 3.0 and GCC 4.2`
			`// Doesn't work with Clang 1.6, and I have no idea why.`
			`#if defined(__clang_major__) && __clang_major__ < 3`
			`#error Compiler not supported.`
			`#endif`

			`#if CP_USE_DOUBLES`
			`#if !__arm64`
			`#error Cannot use CP_USE_DOUBLES on 32 bit ARM.`
			`#endif`

			`typedef float64_t cpFloat_t;`
			`typedef float64x2_t cpFloatx2_t;`
			`#define vld vld1q_f64`
			`#define vdup_n vdupq_n_f64`
			`#define vst vst1q_f64`
			`#define vst_lane vst1q_lane_f64`
			`#define vadd vaddq_f64`
			`#define vsub vsubq_f64`
			`#define vpadd vpaddq_f64`
			`#define vmul vmulq_f64`
			`#define vmul_n vmulq_n_f64`
			`#define vneg vnegq_f64`
			`#define vget_lane vgetq_lane_f64`
			`#define vset_lane vsetq_lane_f64`
			`#define vmin vminq_f64`
			`#define vmax vmaxq_f64`
			`#define vrev(__a) __builtin_shufflevector(__a, __a, 1, 0)`
			`#else`
			`typedef float32_t cpFloat_t;`
			`typedef float32x2_t cpFloatx2_t;`
			`#define vld vld1_f32`
			`#define vdup_n vdup_n_f32`
			`#define vst vst1_f32`
			`#define vst_lane vst1_lane_f32`
			`#define vadd vadd_f32`
			`#define vsub vsub_f32`
			`#define vpadd vpadd_f32`
			`#define vmul vmul_f32`
			`#define vmul_n vmul_n_f32`
			`#define vneg vneg_f32`
			`#define vget_lane vget_lane_f32`
			`#define vset_lane vset_lane_f32`
			`#define vmin vmin_f32`
			`#define vmax vmax_f32`
			`#define vrev vrev64_f32`
			`#endif`

			`// TODO could probably do better here, maybe using vcreate?`
			`// especially for the constants`
			`// Maybe use the {} notation for GCC/Clang?`
			`static inline cpFloatx2_t`
			`vmake(cpFloat_t x, cpFloat_t y)`
			`{`
			`// cpFloatx2_t v = {};`
			`// v = vset_lane(x, v, 0);`
			`// v = vset_lane(y, v, 1);`
			`//`
			`// return v;`

			`// This might not be super compatible, but all the NEON headers use it...`
			`return (cpFloatx2_t){x, y};`
			`}`

			`static void`
			`cpArbiterApplyImpulse_NEON(cpArbiter *arb)`
			`{`
			`cpBody *a = arb->body_a;`
			`cpBody *b = arb->body_b;`
			`cpFloatx2_t surface_vr = vld((cpFloat_t *)&arb->surface_vr);`
			`cpFloatx2_t n = vld((cpFloat_t *)&arb->n);`
			`cpFloat_t friction = arb->u;`

			`int numContacts = arb->count;`
			`struct cpContact *contacts = arb->contacts;`
			`for(int i=0; i<numContacts; i++){`
			`struct cpContact *con = contacts + i;`
			`cpFloatx2_t r1 = vld((cpFloat_t *)&con->r1);`
			`cpFloatx2_t r2 = vld((cpFloat_t *)&con->r2);`

			`cpFloatx2_t perp = vmake(-1.0, 1.0);`
			`cpFloatx2_t r1p = vmul(vrev(r1), perp);`
			`cpFloatx2_t r2p = vmul(vrev(r2), perp);`

			`cpFloatx2_t vBias_a = vld((cpFloat_t *)&a->v_bias);`
			`cpFloatx2_t vBias_b = vld((cpFloat_t *)&b->v_bias);`
			`cpFloatx2_t wBias = vmake(a->w_bias, b->w_bias);`

			`cpFloatx2_t vb1 = vadd(vBias_a, vmul_n(r1p, vget_lane(wBias, 0)));`
			`cpFloatx2_t vb2 = vadd(vBias_b, vmul_n(r2p, vget_lane(wBias, 1)));`
			`cpFloatx2_t vbr = vsub(vb2, vb1);`

			`cpFloatx2_t v_a = vld((cpFloat_t *)&a->v);`
			`cpFloatx2_t v_b = vld((cpFloat_t *)&b->v);`
			`cpFloatx2_t w = vmake(a->w, b->w);`
			`cpFloatx2_t v1 = vadd(v_a, vmul_n(r1p, vget_lane(w, 0)));`
			`cpFloatx2_t v2 = vadd(v_b, vmul_n(r2p, vget_lane(w, 1)));`
			`cpFloatx2_t vr = vsub(v2, v1);`

			`cpFloatx2_t vbn_vrn = vpadd(vmul(vbr, n), vmul(vr, n));`

			`cpFloatx2_t v_offset = vmake(con->bias, -con->bounce);`
			`cpFloatx2_t jOld = vmake(con->jBias, con->jnAcc);`
			`cpFloatx2_t jbn_jn = vmul_n(vsub(v_offset, vbn_vrn), con->nMass);`
			`jbn_jn = vmax(vadd(jOld, jbn_jn), vdup_n(0.0));`
			`cpFloatx2_t jApply = vsub(jbn_jn, jOld);`

			`cpFloatx2_t t = vmul(vrev(n), perp);`
			`cpFloatx2_t vrt_tmp = vmul(vadd(vr, surface_vr), t);`
			`cpFloatx2_t vrt = vpadd(vrt_tmp, vrt_tmp);`

			`cpFloatx2_t jtOld = {}; jtOld = vset_lane(con->jtAcc, jtOld, 0);`
			`cpFloatx2_t jtMax = vrev(vmul_n(jbn_jn, friction));`
			`cpFloatx2_t jt = vmul_n(vrt, -con->tMass);`
			`jt = vmax(vneg(jtMax), vmin(vadd(jtOld, jt), jtMax));`
			`cpFloatx2_t jtApply = vsub(jt, jtOld);`

			`cpFloatx2_t i_inv = vmake(-a->i_inv, b->i_inv);`
			`cpFloatx2_t nperp = vmake(1.0, -1.0);`

			`cpFloatx2_t jBias = vmul_n(n, vget_lane(jApply, 0));`
			`cpFloatx2_t jBiasCross = vmul(vrev(jBias), nperp);`
			`cpFloatx2_t biasCrosses = vpadd(vmul(r1, jBiasCross), vmul(r2, jBiasCross));`
			`wBias = vadd(wBias, vmul(i_inv, biasCrosses));`

			`vBias_a = vsub(vBias_a, vmul_n(jBias, a->m_inv));`
			`vBias_b = vadd(vBias_b, vmul_n(jBias, b->m_inv));`

			`cpFloatx2_t j = vadd(vmul_n(n, vget_lane(jApply, 1)), vmul_n(t, vget_lane(jtApply, 0)));`
			`cpFloatx2_t jCross = vmul(vrev(j), nperp);`
			`cpFloatx2_t crosses = vpadd(vmul(r1, jCross), vmul(r2, jCross));`
			`w = vadd(w, vmul(i_inv, crosses));`

			`v_a = vsub(v_a, vmul_n(j, a->m_inv));`
			`v_b = vadd(v_b, vmul_n(j, b->m_inv));`

			`// TODO would moving these earlier help pipeline them better?`
			`vst((cpFloat_t *)&a->v_bias, vBias_a);`
			`vst((cpFloat_t *)&b->v_bias, vBias_b);`
			`vst_lane((cpFloat_t *)&a->w_bias, wBias, 0);`
			`vst_lane((cpFloat_t *)&b->w_bias, wBias, 1);`

			`vst((cpFloat_t *)&a->v, v_a);`
			`vst((cpFloat_t *)&b->v, v_b);`
			`vst_lane((cpFloat_t *)&a->w, w, 0);`
			`vst_lane((cpFloat_t *)&b->w, w, 1);`

			`vst_lane((cpFloat_t *)&con->jBias, jbn_jn, 0);`
			`vst_lane((cpFloat_t *)&con->jnAcc, jbn_jn, 1);`
			`vst_lane((cpFloat_t *)&con->jtAcc, jt, 0);`
			`}`
			`}`

			`#endif`

			`//MARK: PThreads`

			`// Right now using more than 2 threads probably wont help your performance any.`
			`// If you are using a ridiculous number of iterations it could help though.`
			`#define MAX_THREADS 2`

			`struct ThreadContext {`
			`pthread_t thread;`
			`cpHastySpace *space;`
			`unsigned long thread_num;`
			`};`

			`typedef void (cpHastySpaceWorkFunction)(cpSpace space, unsigned long worker, unsigned long worker_count);`

			`struct cpHastySpace {`
			`cpSpace space;`

			`// Number of worker threads (including the main thread)`
			`unsigned long num_threads;`

			`// Number of worker threads currently executing. (also including the main thread)`
			`unsigned long num_working;`

			`// Number of constraints (plus contacts) that must exist per step to start the worker threads.`
			`unsigned long constraint_count_threshold;`

			`pthread_mutex_t mutex;`
			`pthread_cond_t cond_work, cond_resume;`

			`// Work function to invoke.`
			`cpHastySpaceWorkFunction work;`

			`struct ThreadContext workers[MAX_THREADS - 1];`
			`};`

			`static void *`
			`WorkerThreadLoop(struct ThreadContext *context)`
			`{`
			`cpHastySpace *hasty = context->space;`

			`unsigned long thread = context->thread_num;`
			`unsigned long num_threads = hasty->num_threads;`

			`for(;;){`
			`pthread_mutex_lock(&hasty->mutex); {`
			`if(--hasty->num_working == 0){`
			`pthread_cond_signal(&hasty->cond_resume);`
			`}`

			`pthread_cond_wait(&hasty->cond_work, &hasty->mutex);`
			`} pthread_mutex_unlock(&hasty->mutex);`

			`cpHastySpaceWorkFunction func = hasty->work;`
			`if(func){`
			`hasty->work(&hasty->space, thread, num_threads);`
			`} else {`
			`break;`
			`}`
			`}`

			`return NULL;`
			`}`

			// Run func on every solver thread and return once all of them are done.
			//
			// The calling thread always executes func as worker 0. When worker threads
			// exist they are woken through cond_work, and the caller then blocks on
			// cond_resume until num_working has dropped back to zero. The work pointer
			// is cleared again before returning.
			static void
			RunWorkers(cpHastySpace *hasty, cpHastySpaceWorkFunction func)
			{
				hasty->num_working = hasty->num_threads - 1;
				hasty->work = func;

				if(hasty->num_working > 0){
					pthread_mutex_lock(&hasty->mutex); {
						pthread_cond_broadcast(&hasty->cond_work);
					} pthread_mutex_unlock(&hasty->mutex);

					func((cpSpace *)hasty, 0, hasty->num_threads);

					pthread_mutex_lock(&hasty->mutex); {
						// Loop on the predicate: a condition wait may return spuriously,
						// and returning early would let the step continue while workers
						// are still applying impulses.
						while(hasty->num_working > 0){
							pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
						}
					} pthread_mutex_unlock(&hasty->mutex);
				} else {
					func((cpSpace *)hasty, 0, hasty->num_threads);
				}

				hasty->work = NULL;
			}

			// Work function for the impulse solver.
			//
			// Each participating thread runs ceil(space->iterations / worker_count)
			// passes, and every pass walks ALL arbiters and ALL constraints; the lists
			// are not partitioned by worker, so the worker index is unused here.
			static void
			Solver(cpSpace *space, unsigned long worker, unsigned long worker_count)
			{
				cpArray *constraints = space->constraints;
				cpArray *arbiters = space->arbiters;

				cpFloat dt = space->curr_dt;
				// Round up so the threads together run at least space->iterations passes.
				unsigned long iterations = (space->iterations + worker_count - 1)/worker_count;

				for(unsigned long i=0; i<iterations; i++){
					for(int j=0; j<arbiters->num; j++){
						cpArbiter *arb = (cpArbiter *)arbiters->arr[j];
			#ifdef __ARM_NEON__
						cpArbiterApplyImpulse_NEON(arb);
			#else
						cpArbiterApplyImpulse(arb);
			#endif
					}

					for(int j=0; j<constraints->num; j++){
						cpConstraint *constraint = (cpConstraint *)constraints->arr[j];
						constraint->klass->applyImpulse(constraint, dt);
					}
				}
			}

			//MARK: Thread Management Functions

			// Stop and join every spawned worker thread.
			//
			// Clears the work pointer and broadcasts cond_work under the mutex, so each
			// waiting worker wakes up, sees NULL and leaves its loop. Then joins the
			// num_threads - 1 worker handles. With num_threads == 1 the join loop does
			// not run.
			static void
			HaltThreads(cpHastySpace *hasty)
			{
				pthread_mutex_t *mutex = &hasty->mutex;
				pthread_mutex_lock(mutex); {
					hasty->work = NULL; // NULL work function means break and exit
					pthread_cond_broadcast(&hasty->cond_work);
				} pthread_mutex_unlock(mutex);

				for(unsigned long i=0; i<(hasty->num_threads-1); i++){
					pthread_join(hasty->workers[i].thread, NULL);
				}
			}

			// Set the number of solver threads used by a hasty space.
			//
			//   space   - a space created by cpHastySpaceNew()
			//   threads - requested total thread count, including the calling thread.
			//             0 asks for a default: the "hw.ncpu" sysctl value on Apple
			//             platforms, 1 everywhere else. Values above MAX_THREADS are
			//             clamped.
			//
			// Any existing workers are halted and joined first. New workers are then
			// created, and the call returns only after all of them have reported idle.
			void
			cpHastySpaceSetThreads(cpSpace *space, unsigned long threads)
			{
			#if TARGET_IPHONE_SIMULATOR == 1
				// Individual values appear to be written non-atomically when compiled as debug for the simulator.
				// No idea why, so threads are disabled.
				threads = 1;
			#endif

				cpHastySpace *hasty = (cpHastySpace *)space;
				HaltThreads(hasty);

			#ifdef __APPLE__
				if(threads == 0){
					size_t size = sizeof(threads);
					sysctlbyname("hw.ncpu", &threads, &size, NULL, 0);
				}
			#endif

				// Never continue with a zero count (including a failed sysctl query):
				// num_threads - 1 below is unsigned and would wrap around.
				if(threads == 0) threads = 1;

				hasty->num_threads = (threads < MAX_THREADS ? threads : MAX_THREADS);
				hasty->num_working = hasty->num_threads - 1;

				// Create the worker threads and wait for them to signal ready.
				if(hasty->num_working > 0){
					// The mutex is held while the threads are created, so no worker can
					// report idle before this thread is waiting on cond_resume.
					pthread_mutex_lock(&hasty->mutex);
					for(unsigned long i=0; i<(hasty->num_threads-1); i++){
						hasty->workers[i].space = hasty;
						hasty->workers[i].thread_num = i + 1;

						// NOTE(review): the result of pthread_create() is not checked. If
						// creation fails, num_working never reaches zero and the wait
						// below does not finish.
						pthread_create(&hasty->workers[i].thread, NULL, (void*(*)(void*))WorkerThreadLoop, &hasty->workers[i]);
					}

					// Loop on the predicate so a spurious wake-up cannot end the wait early.
					while(hasty->num_working > 0){
						pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
					}
					pthread_mutex_unlock(&hasty->mutex);
				}
			}

			// Return the total number of solver threads configured for a hasty space,
			// counting the calling thread. The space must be a cpHastySpace.
			unsigned long
			cpHastySpaceGetThreads(cpSpace *space)
			{
				return ((cpHastySpace *)space)->num_threads;
			}

			//MARK: Overridden cpSpace Functions.

			`cpSpace *`
			`cpHastySpaceNew(void)`
			`{`
			`cpHastySpace hasty = (cpHastySpace )cpcalloc(1, sizeof(cpHastySpace));`
			`cpSpaceInit((cpSpace *)hasty);`

			`pthread_mutex_init(&hasty->mutex, NULL);`
			`pthread_cond_init(&hasty->cond_work, NULL);`
			`pthread_cond_init(&hasty->cond_resume, NULL);`

			`// TODO magic number, should test this more thoroughly.`
			`hasty->constraint_count_threshold = 50;`

			`// Default to 1 thread for determinism.`
			`hasty->num_threads = 1;`
			`cpHastySpaceSetThreads((cpSpace *)hasty, 1);`

			`return (cpSpace *)hasty;`
			`}`

			// Destroy a hasty space created by cpHastySpaceNew().
			//
			// The workers are halted and joined before the mutex and condition variables
			// are destroyed; the space itself is then released through cpSpaceFree().
			void
			cpHastySpaceFree(cpSpace *space)
			{
				cpHastySpace *hasty = (cpHastySpace *)space;

				HaltThreads(hasty);

				pthread_mutex_destroy(&hasty->mutex);
				pthread_cond_destroy(&hasty->cond_work);
				pthread_cond_destroy(&hasty->cond_resume);

				cpSpaceFree(space);
			}

			// Advance a hasty space by dt.
			//
			// Order of work: reset last step's arbiters, integrate positions, collide
			// shapes, rebuild the contact graph, pre-step arbiters and constraints,
			// integrate velocities, apply cached impulses, run the impulse solver, then
			// fire the post-solve callbacks. The solver is handed to RunWorkers() only
			// when arbiters + constraints exceeds constraint_count_threshold; otherwise
			// it runs inline on the calling thread.
			//
			//   space - a space created by cpHastySpaceNew()
			//   dt    - time step; a step of exactly 0 is ignored
			void
			cpHastySpaceStep(cpSpace *space, cpFloat dt)
			{
				// don't step if the timestep is 0!
				if(dt == 0.0f) return;

				space->stamp++;

				cpFloat prev_dt = space->curr_dt;
				space->curr_dt = dt;

				cpArray *bodies = space->dynamicBodies;
				cpArray *constraints = space->constraints;
				cpArray *arbiters = space->arbiters;

				// Reset and empty the arbiter list.
				for(int i=0; i<arbiters->num; i++){
					cpArbiter *arb = (cpArbiter *)arbiters->arr[i];
					arb->state = CP_ARBITER_STATE_NORMAL;

					// If both bodies are awake, unthread the arbiter from the contact graph.
					if(!cpBodyIsSleeping(arb->body_a) && !cpBodyIsSleeping(arb->body_b)){
						cpArbiterUnthread(arb);
					}
				}
				arbiters->num = 0;

				cpSpaceLock(space); {
					// Integrate positions
					for(int i=0; i<bodies->num; i++){
						cpBody *body = (cpBody *)bodies->arr[i];
						body->position_func(body, dt);
					}

					// Find colliding pairs.
					cpSpacePushFreshContactBuffer(space);
					cpSpatialIndexEach(space->dynamicShapes, (cpSpatialIndexIteratorFunc)cpShapeUpdateFunc, NULL);
					cpSpatialIndexReindexQuery(space->dynamicShapes, (cpSpatialIndexQueryFunc)cpSpaceCollideShapes, space);
				} cpSpaceUnlock(space, cpFalse);

				// Rebuild the contact graph (and detect sleeping components if sleeping is enabled)
				cpSpaceProcessComponents(space, dt);

				cpSpaceLock(space); {
					// Clear out old cached arbiters and call separate callbacks
					cpHashSetFilter(space->cachedArbiters, (cpHashSetFilterFunc)cpSpaceArbiterSetFilter, space);

					// Prestep the arbiters and constraints.
					cpFloat slop = space->collisionSlop;
					cpFloat biasCoef = 1.0f - cpfpow(space->collisionBias, dt);
					for(int i=0; i<arbiters->num; i++){
						cpArbiterPreStep((cpArbiter *)arbiters->arr[i], dt, slop, biasCoef);
					}

					for(int i=0; i<constraints->num; i++){
						cpConstraint *constraint = (cpConstraint *)constraints->arr[i];

						cpConstraintPreSolveFunc preSolve = constraint->preSolve;
						if(preSolve) preSolve(constraint, space);

						constraint->klass->preStep(constraint, dt);
					}

					// Integrate velocities.
					cpFloat damping = cpfpow(space->damping, dt);
					cpVect gravity = space->gravity;
					for(int i=0; i<bodies->num; i++){
						cpBody *body = (cpBody *)bodies->arr[i];
						body->velocity_func(body, gravity, damping, dt);
					}

					// Apply cached impulses
					// Scaled by the ratio of this step to the previous one; 0 on the very
					// first step, when there is no previous dt.
					cpFloat dt_coef = (prev_dt == 0.0f ? 0.0f : dt/prev_dt);
					for(int i=0; i<arbiters->num; i++){
						cpArbiterApplyCachedImpulse((cpArbiter *)arbiters->arr[i], dt_coef);
					}

					for(int i=0; i<constraints->num; i++){
						cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
						constraint->klass->applyCachedImpulse(constraint, dt_coef);
					}

					// Run the impulse solver.
					cpHastySpace *hasty = (cpHastySpace *)space;
					if((unsigned long)(arbiters->num + constraints->num) > hasty->constraint_count_threshold){
						RunWorkers(hasty, Solver);
					} else {
						Solver(space, 0, 1);
					}

					// Run the constraint post-solve callbacks
					for(int i=0; i<constraints->num; i++){
						cpConstraint *constraint = (cpConstraint *)constraints->arr[i];

						cpConstraintPostSolveFunc postSolve = constraint->postSolve;
						if(postSolve) postSolve(constraint, space);
					}

					// run the post-solve callbacks
					for(int i=0; i<arbiters->num; i++){
						cpArbiter *arb = (cpArbiter *) arbiters->arr[i];

						cpCollisionHandler *handler = arb->handler;
						handler->postSolveFunc(arb, space, handler->userData);
					}
				} cpSpaceUnlock(space, cpTrue);
			}