Lightweight Commit of libsquish

2025-12-10 05:57:49 +00:00 · 2015-04-04 13:19:51 -10:00
commit 592d3b8831
28 changed files with 3701 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,104 @@
+# cmake build file for squish
+# by Stefan Roettger (stefan@stereofx.org)
+# updated by Simon Brown (si@sjbrown.co.uk)
+
+# features:
+#   Xcode: builds universal binaries, uses SSE2 on i386 and Altivec on ppc
+#   Unix and VS: SSE2 support is enabled by default
+#   use BUILD_SQUISH_WITH_SSE2 and BUILD_SQUISH_WITH_ALTIVEC to override
+
+PROJECT(squish)
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.3)
+
+OPTION(BUILD_SQUISH_WITH_SSE2 "Build with SSE2." ON)
+OPTION(BUILD_SQUISH_WITH_ALTIVEC "Build with Altivec." OFF)
+
+OPTION(BUILD_SHARED_LIBS "Build shared libraries." OFF)
+
+OPTION(BUILD_SQUISH_EXTRA "Build extra source code." OFF)
+
+IF (CMAKE_GENERATOR STREQUAL "Xcode")
+    SET(CMAKE_OSX_ARCHITECTURES "i386;ppc")
+ELSE (CMAKE_GENERATOR STREQUAL "Xcode")
+    IF (BUILD_SQUISH_WITH_SSE2 AND NOT WIN32)
+        ADD_DEFINITIONS(-DSQUISH_USE_SSE=2 -msse2)
+    ENDIF (BUILD_SQUISH_WITH_SSE2 AND NOT WIN32)
+    IF (BUILD_SQUISH_WITH_ALTIVEC AND NOT WIN32)
+        ADD_DEFINITIONS(-DSQUISH_USE_ALTIVEC=1 -maltivec)
+    ENDIF (BUILD_SQUISH_WITH_ALTIVEC AND NOT WIN32)
+ENDIF (CMAKE_GENERATOR STREQUAL "Xcode")
+
+SET(SQUISH_HDRS
+    squish.h
+    )
+
+SET(SQUISH_SRCS
+    alpha.cpp
+    alpha.h
+    clusterfit.cpp
+    clusterfit.h
+    colourblock.cpp
+    colourblock.h
+    colourfit.cpp
+    colourfit.h
+    colourset.cpp
+    colourset.h
+    maths.cpp
+    maths.h
+    rangefit.cpp
+    rangefit.h
+    simd.h
+    simd_float.h
+    simd_sse.h
+    simd_ve.h
+    singlecolourfit.cpp
+    singlecolourfit.h
+    singlecolourlookup.inl
+    squish.cpp
+    )
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+ADD_LIBRARY(squish ${SQUISH_SRCS} ${SQUISH_HDRS})
+
+SET_TARGET_PROPERTIES(
+    squish PROPERTIES
+    PUBLIC_HEADER "${SQUISH_HDRS}"
+    VERSION 0.0
+    SOVERSION 0.0
+    DEBUG_POSTFIX "d"
+    XCODE_ATTRIBUTE_GCC_PREPROCESSOR_DEFINITIONS "$(SQUISH_CPP_$(CURRENT_ARCH))"
+    XCODE_ATTRIBUTE_OTHER_CFLAGS "$(SQUISH_CFLAGS_$(CURRENT_ARCH))"
+    XCODE_ATTRIBUTE_SQUISH_CPP_i386 "SQUISH_USE_SSE=2"
+    XCODE_ATTRIBUTE_SQUISH_CFLAGS_i386 ""
+    XCODE_ATTRIBUTE_SQUISH_CPP_ppc "SQUISH_USE_ALTIVEC=1"
+    XCODE_ATTRIBUTE_SQUISH_CFLAGS_ppc "-maltivec"
+    )
+
+IF (BUILD_SQUISH_EXTRA)
+    SET(SQUISHTEST_SRCS extra/squishtest.cpp)
+
+    ADD_EXECUTABLE(squishtest ${SQUISHTEST_SRCS})
+    SET_TARGET_PROPERTIES(squishtest PROPERTIES DEBUG_POSTFIX "d")
+    TARGET_LINK_LIBRARIES(squishtest squish)
+
+    SET(SQUISHPNG_SRCS extra/squishpng.cpp)
+
+    FIND_PACKAGE(PNG)
+
+    IF (PNG_FOUND)
+        SET(CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES)
+        INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+        ADD_EXECUTABLE(squishpng ${SQUISHPNG_SRCS})
+        SET_TARGET_PROPERTIES(squishpng PROPERTIES DEBUG_POSTFIX "d")
+        TARGET_LINK_LIBRARIES(squishpng squish ${PNG_LIBRARIES})
+    ENDIF (PNG_FOUND)
+ENDIF (BUILD_SQUISH_EXTRA)
+
+INSTALL(
+    TARGETS squish
+    LIBRARY DESTINATION lib
+    ARCHIVE DESTINATION lib
+    PUBLIC_HEADER DESTINATION include
+    )
--- a/CMakeModules/FindlibSquish.cmake
+++ b/CMakeModules/FindlibSquish.cmake
@@ -0,0 +1,14 @@
+# Defines
+#  LIBSQUISH_FOUND
+#  LIBSQUISH_INCLUDE_DIR
+#  LIBSQUISH_LIBRARIES
+
+FIND_PATH(LIBSQUISH_INCLUDE_DIR squish.h PATHS . squish .. ../squish DOC "Directory containing libSquish headers")
+FIND_LIBRARY(LIBSQUISH_LIBRARY NAMES squish libsquish PATHS . squish .. ../squish PATH_SUFFIXES lib lib64 release minsizerel relwithdebinfo DOC "Path to libSquish library")
+
+SET(LIBSQUISH_LIBRARIES ${LIBSQUISH_LIBRARY})
+
+IF (LIBSQUISH_LIBRARY AND LIBSQUISH_INCLUDE_DIR)
+   SET(LIBSQUISH_FOUND TRUE)
+   MESSAGE(STATUS "Found libSquish: ${LIBSQUISH_LIBRARY}")
+ENDIF (LIBSQUISH_LIBRARY AND LIBSQUISH_INCLUDE_DIR)
--- a/52
+++ b/52
@@ -0,0 +1,52 @@
+1.10
+* Iterative cluster fit is now considered to be a new compression mode
+* The core cluster fit is now 4x faster using contributions by Ignacio
+Castano from NVIDIA
+* The single colour lookup table has been halved by exploiting symmetry
+
+1.9
+* Added contributed SSE1 truncate implementation
+* Changed use of SQUISH_USE_SSE to be 1 for SSE and 2 for SSE2 instructions
+* Cluster fit is now iterative to further reduce image error
+
+1.8
+* Switched from using floor to trunc for much better SSE performance (again)
+* Xcode build now expects libpng in /usr/local for extra/squishpng
+
+1.7
+* Fixed floating-point equality issue in clusterfit sort (x86 affected only)
+* Implemented proper SSE(2) floor function for 50% speedup on SSE builds 
+* The range fit implementation now uses the correct colour metric
+
+1.6
+* Fixed bug in CompressImage where masked pixels were not skipped over
+* DXT3 and DXT5 alpha compression now properly use the mask to ignore pixels
+* Fixed major DXT1 bug that can generate unexpected transparent pixels
+
+1.5
+* Added CompressMasked function to handle incomplete DXT blocks more cleanly
+* Added kWeightColourByAlpha flag for better quality images when alpha blending
+
+1.4
+* Fixed stack overflow in rangefit
+
+1.3
+* Worked around SSE floor implementation bug, proper fix needed!
+* This release has visual studio and makefile builds that work
+
+1.2
+* Added provably optimal single colour compressor
+* Added extra/squishgen.cpp that generates single colour lookup tables
+
+1.1
+* Fixed a DXT1 colour output bug
+* Changed argument order for Decompress function to match Compress
+* Added GetStorageRequirements function
+* Added CompressImage function
+* Added DecompressImage function
+* Moved squishtool.cpp to extra/squishpng.cpp
+* Added extra/squishtest.cpp
+
+1.0
+* Initial release
+
--- a/20
+++ b/20
@@ -0,0 +1,20 @@
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/35
+++ b/35
@@ -0,0 +1,35 @@
+LICENSE
+-------
+
+The squish library is distributed under the terms and conditions of the MIT
+license. This license is specified at the top of each source file and must be
+preserved in its entirety.
+
+BUILDING AND INSTALLING THE LIBRARY
+-----------------------------------
+
+If you are using Visual Studio 2003 or above under Windows then load the Visual
+Studio 2003 project in the vs7 folder. By default, the library is built using
+SSE2 optimisations. To change this either change or remove the SQUISH_USE_SSE=2
+from the preprocessor symbols.
+
+If you are using a Mac then load the Xcode 2.2 project in the distribution. By
+default, the library is built using Altivec optimisations. To change this
+either change or remove SQUISH_USE_ALTIVEC=1 from the preprocessor symbols. I
+guess I'll have to think about changing this for the new Intel Macs that are
+rolling out...
+
+If you are using unix then first edit the config file in the base directory of
+the distribution, enabling Altivec or SSE with the USE_ALTIVEC or USE_SSE
+variables, and editing the optimisation flags passed to the C++ compiler if
+necessary. Then make can be used to build the library, and make install (from
+the superuser account) can be used to install (into /usr/local by default).
+
+REPORTING BUGS OR FEATURE REQUESTS
+----------------------------------
+
+Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk
+
+New releases are announced on the squish library homepage at
+http://sjbrown.co.uk/?code=squish
+
--- a/alpha.cpp
+++ b/alpha.cpp
@@ -0,0 +1,350 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "alpha.h"
+
+#include <climits>
+#include <algorithm>
+
+namespace squish {
+
+static int FloatToInt( float a, int limit )
+{
+	// use ANSI round-to-zero behaviour to get round-to-nearest
+	int i = ( int )( a + 0.5f );
+
+	// clamp to the limit
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit; 
+
+	// done
+	return i;
+}
+
+void CompressAlphaDxt3( u8 const* rgba, int mask, void* block )
+{
+	u8* bytes = reinterpret_cast< u8* >( block );
+	
+	// quantise and pack the alpha values pairwise
+	for( int i = 0; i < 8; ++i )
+	{
+		// quantise down to 4 bits
+		float alpha1 = ( float )rgba[8*i + 3] * ( 15.0f/255.0f );
+		float alpha2 = ( float )rgba[8*i + 7] * ( 15.0f/255.0f );
+		int quant1 = FloatToInt( alpha1, 15 );
+		int quant2 = FloatToInt( alpha2, 15 );
+		
+		// set alpha to zero where masked
+		int bit1 = 1 << ( 2*i );
+		int bit2 = 1 << ( 2*i + 1 );
+		if( ( mask & bit1 ) == 0 )
+			quant1 = 0;
+		if( ( mask & bit2 ) == 0 )
+			quant2 = 0;
+
+		// pack into the byte
+		bytes[i] = ( u8 )( quant1 | ( quant2 << 4 ) );
+	}
+}
+
+void DecompressAlphaDxt3( u8* rgba, void const* block )
+{
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
+	
+	// unpack the alpha values pairwise
+	for( int i = 0; i < 8; ++i )
+	{
+		// quantise down to 4 bits
+		u8 quant = bytes[i];
+		
+		// unpack the values
+		u8 lo = quant & 0x0f;
+		u8 hi = quant & 0xf0;
+
+		// convert back up to bytes
+		rgba[8*i + 3] = lo | ( lo << 4 );
+		rgba[8*i + 7] = hi | ( hi >> 4 );
+	}
+}
+
+static void FixRange( int& min, int& max, int steps )
+{
+	if( max - min < steps )
+		max = std::min( min + steps, 255 );
+	if( max - min < steps )
+		min = std::max( 0, max - steps );
+}
+
+static int FitCodes( u8 const* rgba, int mask, u8 const* codes, u8* indices )
+{
+	// fit each alpha value to the codebook
+	int err = 0;
+	for( int i = 0; i < 16; ++i )
+	{
+		// check this pixel is valid
+		int bit = 1 << i;
+		if( ( mask & bit ) == 0 )
+		{
+			// use the first code
+			indices[i] = 0;
+			continue;
+		}
+		
+		// find the least error and corresponding index
+		int value = rgba[4*i + 3];
+		int least = INT_MAX;
+		int index = 0;
+		for( int j = 0; j < 8; ++j )
+		{
+			// get the squared error from this code
+			int dist = ( int )value - ( int )codes[j];
+			dist *= dist;
+			
+			// compare with the best so far
+			if( dist < least )
+			{
+				least = dist;
+				index = j;
+			}
+		}
+		
+		// save this index and accumulate the error
+		indices[i] = ( u8 )index;
+		err += least;
+	}
+	
+	// return the total error
+	return err;
+}
+
+static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	u8* bytes = reinterpret_cast< u8* >( block );
+	
+	// write the first two bytes
+	bytes[0] = ( u8 )alpha0;
+	bytes[1] = ( u8 )alpha1;
+	
+	// pack the indices with 3 bits each
+	u8* dest = bytes + 2;
+	u8 const* src = indices;
+	for( int i = 0; i < 2; ++i )
+	{
+		// pack 8 3-bit values
+		int value = 0;
+		for( int j = 0; j < 8; ++j )
+		{
+			int index = *src++;
+			value |= ( index << 3*j );
+		}
+			
+		// store in 3 bytes
+		for( int j = 0; j < 3; ++j )
+		{
+			int byte = ( value >> 8*j ) & 0xff;
+			*dest++ = ( u8 )byte;
+		}
+	}
+}
+
+static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// check the relative values of the endpoints
+	if( alpha0 > alpha1 )
+	{
+		// swap the indices
+		u8 swapped[16];
+		for( int i = 0; i < 16; ++i )
+		{
+			u8 index = indices[i];
+			if( index == 0 )
+				swapped[i] = 1;
+			else if( index == 1 )
+				swapped[i] = 0;
+			else if( index <= 5 )
+				swapped[i] = 7 - index;
+			else 
+				swapped[i] = index;
+		}
+		
+		// write the block
+		WriteAlphaBlock( alpha1, alpha0, swapped, block );
+	}
+	else
+	{
+		// write the block
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+	}	
+}
+
+static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// check the relative values of the endpoints
+	if( alpha0 < alpha1 )
+	{
+		// swap the indices
+		u8 swapped[16];
+		for( int i = 0; i < 16; ++i )
+		{
+			u8 index = indices[i];
+			if( index == 0 )
+				swapped[i] = 1;
+			else if( index == 1 )
+				swapped[i] = 0;
+			else
+				swapped[i] = 9 - index;
+		}
+		
+		// write the block
+		WriteAlphaBlock( alpha1, alpha0, swapped, block );
+	}
+	else
+	{
+		// write the block
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+	}	
+}
+
+void CompressAlphaDxt5( u8 const* rgba, int mask, void* block )
+{
+	// get the range for 5-alpha and 7-alpha interpolation
+	int min5 = 255;
+	int max5 = 0;
+	int min7 = 255;
+	int max7 = 0;
+	for( int i = 0; i < 16; ++i )
+	{
+		// check this pixel is valid
+		int bit = 1 << i;
+		if( ( mask & bit ) == 0 )
+			continue;
+
+		// incorporate into the min/max
+		int value = rgba[4*i + 3];
+		if( value < min7 )
+			min7 = value;
+		if( value > max7 )
+			max7 = value;
+		if( value != 0 && value < min5 )
+			min5 = value;
+		if( value != 255 && value > max5 )
+			max5 = value;
+	}
+	
+	// handle the case that no valid range was found
+	if( min5 > max5 )
+		min5 = max5;
+	if( min7 > max7 )
+		min7 = max7;
+		
+	// fix the range to be the minimum in each case
+	FixRange( min5, max5, 5 );
+	FixRange( min7, max7, 7 );
+	
+	// set up the 5-alpha code book
+	u8 codes5[8];
+	codes5[0] = ( u8 )min5;
+	codes5[1] = ( u8 )max5;
+	for( int i = 1; i < 5; ++i )
+		codes5[1 + i] = ( u8 )( ( ( 5 - i )*min5 + i*max5 )/5 );
+	codes5[6] = 0;
+	codes5[7] = 255;
+	
+	// set up the 7-alpha code book
+	u8 codes7[8];
+	codes7[0] = ( u8 )min7;
+	codes7[1] = ( u8 )max7;
+	for( int i = 1; i < 7; ++i )
+		codes7[1 + i] = ( u8 )( ( ( 7 - i )*min7 + i*max7 )/7 );
+		
+	// fit the data to both code books
+	u8 indices5[16];
+	u8 indices7[16];
+	int err5 = FitCodes( rgba, mask, codes5, indices5 );
+	int err7 = FitCodes( rgba, mask, codes7, indices7 );
+	
+	// save the block with least error
+	if( err5 <= err7 )
+		WriteAlphaBlock5( min5, max5, indices5, block );
+	else
+		WriteAlphaBlock7( min7, max7, indices7, block );
+}
+
+void DecompressAlphaDxt5( u8* rgba, void const* block )
+{
+	// get the two alpha values
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
+	int alpha0 = bytes[0];
+	int alpha1 = bytes[1];
+	
+	// compare the values to build the codebook
+	u8 codes[8];
+	codes[0] = ( u8 )alpha0;
+	codes[1] = ( u8 )alpha1;
+	if( alpha0 <= alpha1 )
+	{
+		// use 5-alpha codebook
+		for( int i = 1; i < 5; ++i )
+			codes[1 + i] = ( u8 )( ( ( 5 - i )*alpha0 + i*alpha1 )/5 );
+		codes[6] = 0;
+		codes[7] = 255;
+	}
+	else
+	{
+		// use 7-alpha codebook
+		for( int i = 1; i < 7; ++i )
+			codes[1 + i] = ( u8 )( ( ( 7 - i )*alpha0 + i*alpha1 )/7 );
+	}
+	
+	// decode the indices
+	u8 indices[16];
+	u8 const* src = bytes + 2;
+	u8* dest = indices;
+	for( int i = 0; i < 2; ++i )
+	{
+		// grab 3 bytes
+		int value = 0;
+		for( int j = 0; j < 3; ++j )
+		{
+			int byte = *src++;
+			value |= ( byte << 8*j );
+		}
+		
+		// unpack 8 3-bit values from it
+		for( int j = 0; j < 8; ++j )
+		{
+			int index = ( value >> 3*j ) & 0x7;
+			*dest++ = ( u8 )index;
+		}
+	}
+	
+	// write out the indexed codebook values
+	for( int i = 0; i < 16; ++i )
+		rgba[4*i + 3] = codes[indices[i]];
+}
+
+} // namespace squish
--- a/alpha.h
+++ b/alpha.h
@@ -0,0 +1,41 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_ALPHA_H
+#define SQUISH_ALPHA_H
+
+#include "squish.h"
+
+namespace squish {
+
+void CompressAlphaDxt3( u8 const* rgba, int mask, void* block );
+void CompressAlphaDxt5( u8 const* rgba, int mask, void* block );
+
+void DecompressAlphaDxt3( u8* rgba, void const* block );
+void DecompressAlphaDxt5( u8* rgba, void const* block );
+
+} // namespace squish
+
+#endif // ndef SQUISH_ALPHA_H
--- a/clusterfit.cpp
+++ b/clusterfit.cpp
@@ -0,0 +1,392 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2007 Ignacio Castano                   icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "clusterfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+ClusterFit::ClusterFit( ColourSet const* colours, int flags, float* metric ) 
+  : ColourFit( colours, flags )
+{
+	// set the iteration count
+	m_iterationCount = ( m_flags & kColourIterativeClusterFit ) ? kMaxIterations : 1;
+
+	// initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f)
+	if( metric )
+		m_metric = Vec4( metric[0], metric[1], metric[2], 1.0f );
+	else
+		m_metric = VEC4_CONST( 1.0f );	
+
+	// initialise the best error
+	m_besterror = VEC4_CONST( FLT_MAX );
+
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+
+	// get the covariance matrix
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	
+	// compute the principle component
+	m_principle = ComputePrincipleComponent( covariance );
+}
+
+bool ClusterFit::ConstructOrdering( Vec3 const& axis, int iteration )
+{
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+
+	// build the list of dot products
+	float dps[16];
+	u8* order = ( u8* )m_order + 16*iteration;
+	for( int i = 0; i < count; ++i )
+	{
+		dps[i] = Dot( values[i], axis );
+		order[i] = ( u8 )i;
+	}
+		
+	// stable sort using them
+	for( int i = 0; i < count; ++i )
+	{
+		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		{
+			std::swap( dps[j], dps[j - 1] );
+			std::swap( order[j], order[j - 1] );
+		}
+	}
+	
+	// check this ordering is unique
+	for( int it = 0; it < iteration; ++it )
+	{
+		u8 const* prev = ( u8* )m_order + 16*it;
+		bool same = true;
+		for( int i = 0; i < count; ++i )
+		{
+			if( order[i] != prev[i] )
+			{
+				same = false;
+				break;
+			}
+		}
+		if( same )
+			return false;
+	}
+	
+	// copy the ordering and weight all the points
+	Vec3 const* unweighted = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+	m_xsum_wsum = VEC4_CONST( 0.0f );
+	for( int i = 0; i < count; ++i )
+	{
+		int j = order[i];
+		Vec4 p( unweighted[j].X(), unweighted[j].Y(), unweighted[j].Z(), 1.0f );
+		Vec4 w( weights[j] );
+		Vec4 x = p*w;
+		m_points_weights[i] = x;
+		m_xsum_wsum += x;
+	}
+	return true;
+}
+
+void ClusterFit::Compress3( void* block )
+{
+	// declare variables
+	int const count = m_colours->GetCount();
+	Vec4 const two = VEC4_CONST( 2.0 );
+	Vec4 const one = VEC4_CONST( 1.0f );
+	Vec4 const half_half2( 0.5f, 0.5f, 0.5f, 0.25f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+	// prepare an ordering using the principle axis
+	ConstructOrdering( m_principle, 0 );
+	
+	// check all possible clusters and iterate on the total order
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = m_besterror;
+	u8 bestindices[16];
+	int bestiteration = 0;
+	int besti = 0, bestj = 0;
+	
+	// loop over iterations (we avoid the case that all points in first or last cluster)
+	for( int iterationIndex = 0;; )
+	{
+		// first cluster [0,i) is at the start
+		Vec4 part0 = VEC4_CONST( 0.0f );
+		for( int i = 0; i < count; ++i )
+		{
+			// second cluster [i,j) is half along
+			Vec4 part1 = ( i == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f );
+			int jmin = ( i == 0 ) ? 1 : i;
+			for( int j = jmin;; )
+			{
+				// last cluster [j,count) is at the end
+				Vec4 part2 = m_xsum_wsum - part1 - part0;
+				
+				// compute least squares terms directly
+				Vec4 alphax_sum = MultiplyAdd( part1, half_half2, part0 );
+				Vec4 alpha2_sum = alphax_sum.SplatW();
+
+				Vec4 betax_sum = MultiplyAdd( part1, half_half2, part2 );
+				Vec4 beta2_sum = betax_sum.SplatW();
+
+				Vec4 alphabeta_sum = ( part1*half_half2 ).SplatW();
+
+				// compute the least-squares optimal points
+				Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) );
+				Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor;
+				Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor;
+
+				// clamp to the grid
+				a = Min( one, Max( zero, a ) );
+				b = Min( one, Max( zero, b ) );
+				a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
+				b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
+				
+				// compute the error (we skip the constant xxsum)
+				Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+				Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+				Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+				Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
+				// apply the metric to the error term
+				Vec4 e5 = e4*m_metric;
+				Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+				
+				// keep the solution if it wins
+				if( CompareAnyLessThan( error, besterror ) )
+				{
+					beststart = a;
+					bestend = b;
+					besti = i;
+					bestj = j;
+					besterror = error;
+					bestiteration = iterationIndex;
+				}
+
+				// advance
+				if( j == count )
+					break;
+				part1 += m_points_weights[j];
+				++j;
+			}
+
+			// advance
+			part0 += m_points_weights[i];
+		}
+		
+		// stop if we didn't improve in this iteration
+		if( bestiteration != iterationIndex )
+			break;
+			
+		// advance if possible
+		++iterationIndex;
+		if( iterationIndex == m_iterationCount )
+			break;
+			
+		// stop if a new iteration is an ordering that has already been tried
+		Vec3 axis = ( bestend - beststart ).GetVec3();
+		if( !ConstructOrdering( axis, iterationIndex ) )
+			break;
+	}
+		
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// remap the indices
+		u8 const* order = ( u8* )m_order + 16*bestiteration;
+
+		u8 unordered[16];
+		for( int m = 0; m < besti; ++m )
+			unordered[order[m]] = 0;
+		for( int m = besti; m < bestj; ++m )
+			unordered[order[m]] = 2;
+		for( int m = bestj; m < count; ++m )
+			unordered[order[m]] = 1;
+
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+void ClusterFit::Compress4( void* block )
+{
+	// declare variables
+	int const count = m_colours->GetCount();
+	Vec4 const two = VEC4_CONST( 2.0f );
+	Vec4 const one = VEC4_CONST( 1.0f );
+	Vec4 const onethird_onethird2( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
+	Vec4 const twothirds_twothirds2( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
+	Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+	// prepare an ordering using the principle axis
+	ConstructOrdering( m_principle, 0 );
+	
+	// check all possible clusters and iterate on the total order
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = m_besterror;
+	u8 bestindices[16];
+	int bestiteration = 0;
+	int besti = 0, bestj = 0, bestk = 0;
+	
+	// loop over iterations (we avoid the case that all points in first or last cluster)
+	for( int iterationIndex = 0;; )
+	{
+		// first cluster [0,i) is at the start
+		Vec4 part0 = VEC4_CONST( 0.0f );
+		for( int i = 0; i < count; ++i )
+		{
+			// second cluster [i,j) is one third along
+			Vec4 part1 = VEC4_CONST( 0.0f );
+			for( int j = i;; )
+			{
+				// third cluster [j,k) is two thirds along
+				Vec4 part2 = ( j == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f );
+				int kmin = ( j == 0 ) ? 1 : j;
+				for( int k = kmin;; )
+				{
+					// last cluster [k,count) is at the end
+					Vec4 part3 = m_xsum_wsum - part2 - part1 - part0;
+
+					// compute least squares terms directly
+					Vec4 const alphax_sum = MultiplyAdd( part2, onethird_onethird2, MultiplyAdd( part1, twothirds_twothirds2, part0 ) );
+					Vec4 const alpha2_sum = alphax_sum.SplatW();
+					
+					Vec4 const betax_sum = MultiplyAdd( part1, onethird_onethird2, MultiplyAdd( part2, twothirds_twothirds2, part3 ) );
+					Vec4 const beta2_sum = betax_sum.SplatW();
+					
+					Vec4 const alphabeta_sum = twonineths*( part1 + part2 ).SplatW();
+
+					// compute the least-squares optimal points
+					Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) );
+					Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor;
+					Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor;
+
+					// clamp to the grid
+					a = Min( one, Max( zero, a ) );
+					b = Min( one, Max( zero, b ) );
+					a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
+					b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
+					
+					// compute the error (we skip the constant xxsum)
+					Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+					Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+					Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+					Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
+					// apply the metric to the error term
+					Vec4 e5 = e4*m_metric;
+					Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+
+					// keep the solution if it wins
+					if( CompareAnyLessThan( error, besterror ) )
+					{
+						beststart = a;
+						bestend = b;
+						besterror = error;
+						besti = i;
+						bestj = j;
+						bestk = k;
+						bestiteration = iterationIndex;
+					}
+
+					// advance
+					if( k == count )
+						break;
+					part2 += m_points_weights[k];
+					++k;
+				}
+
+				// advance
+				if( j == count )
+					break;
+				part1 += m_points_weights[j];
+				++j;
+			}
+
+			// advance
+			part0 += m_points_weights[i];
+		}
+		
+		// stop if we didn't improve in this iteration
+		if( bestiteration != iterationIndex )
+			break;
+			
+		// advance if possible
+		++iterationIndex;
+		if( iterationIndex == m_iterationCount )
+			break;
+			
+		// stop if a new iteration is an ordering that has already been tried
+		Vec3 axis = ( bestend - beststart ).GetVec3();
+		if( !ConstructOrdering( axis, iterationIndex ) )
+			break;
+	}
+
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// remap the indices
+		u8 const* order = ( u8* )m_order + 16*bestiteration;
+
+		u8 unordered[16];
+		for( int m = 0; m < besti; ++m )
+			unordered[order[m]] = 0;
+		for( int m = besti; m < bestj; ++m )
+			unordered[order[m]] = 2;
+		for( int m = bestj; m < bestk; ++m )
+			unordered[order[m]] = 3;
+		for( int m = bestk; m < count; ++m )
+			unordered[order[m]] = 1;
+
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+} // namespace squish
--- a/clusterfit.h
+++ b/clusterfit.h
@@ -0,0 +1,61 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2007 Ignacio Castano                   icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CLUSTERFIT_H
+#define SQUISH_CLUSTERFIT_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+#include "colourfit.h"
+
+namespace squish {
+
+class ClusterFit : public ColourFit
+{
+public:
+	ClusterFit( ColourSet const* colours, int flags, float* metric );
+	
+private:
+	bool ConstructOrdering( Vec3 const& axis, int iteration );
+
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+
+	enum { kMaxIterations = 8 };
+
+	int m_iterationCount;
+	Vec3 m_principle;
+	u8 m_order[16*kMaxIterations];
+	Vec4 m_points_weights[16];
+	Vec4 m_xsum_wsum;
+	Vec4 m_metric;
+	Vec4 m_besterror;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_CLUSTERFIT_H
--- a/colourblock.cpp
+++ b/colourblock.cpp
@@ -0,0 +1,214 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourblock.h"
+
+namespace squish {
+
+static int FloatToInt( float a, int limit )
+{
+	// use ANSI round-to-zero behaviour to get round-to-nearest
+	int i = ( int )( a + 0.5f );
+
+	// clamp to the limit
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit; 
+
+	// done
+	return i;
+}
+
+static int FloatTo565( Vec3::Arg colour )
+{
+	// get the components in the correct range
+	int r = FloatToInt( 31.0f*colour.X(), 31 );
+	int g = FloatToInt( 63.0f*colour.Y(), 63 );
+	int b = FloatToInt( 31.0f*colour.Z(), 31 );
+	
+	// pack into a single value
+	return ( r << 11 ) | ( g << 5 ) | b;
+}
+
+static void WriteColourBlock( int a, int b, u8* indices, void* block )
+{
+	// get the block as bytes
+	u8* bytes = ( u8* )block;
+
+	// write the endpoints
+	bytes[0] = ( u8 )( a & 0xff );
+	bytes[1] = ( u8 )( a >> 8 );
+	bytes[2] = ( u8 )( b & 0xff );
+	bytes[3] = ( u8 )( b >> 8 );
+	
+	// write the indices
+	for( int i = 0; i < 4; ++i )
+	{
+		u8 const* ind = indices + 4*i;
+		bytes[4 + i] = ind[0] | ( ind[1] << 2 ) | ( ind[2] << 4 ) | ( ind[3] << 6 );
+	}
+}
+
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices
+	u8 remapped[16];
+	if( a <= b )
+	{
+		// use the indices directly
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
+	else
+	{
+		// swap a and b
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+		{
+			if( indices[i] == 0 )
+				remapped[i] = 1;
+			else if( indices[i] == 1 )
+				remapped[i] = 0;
+			else
+				remapped[i] = indices[i];
+		}
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices
+	u8 remapped[16];
+	if( a < b )
+	{
+		// swap a and b
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
+	}
+	else if( a == b )
+	{
+		// use index 0
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = 0;
+	}
+	else
+	{
+		// use the indices directly
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+static int Unpack565( u8 const* packed, u8* colour )
+{
+	// build the packed value
+	int value = ( int )packed[0] | ( ( int )packed[1] << 8 );
+	
+	// get the components in the stored range
+	u8 red = ( u8 )( ( value >> 11 ) & 0x1f );
+	u8 green = ( u8 )( ( value >> 5 ) & 0x3f );
+	u8 blue = ( u8 )( value & 0x1f );
+
+	// scale up to 8 bits
+	colour[0] = ( red << 3 ) | ( red >> 2 );
+	colour[1] = ( green << 2 ) | ( green >> 4 );
+	colour[2] = ( blue << 3 ) | ( blue >> 2 );
+	colour[3] = 255;
+	
+	// return the value
+	return value;
+}
+
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
+{
+	// get the block bytes
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
+	
+	// unpack the endpoints
+	u8 codes[16];
+	int a = Unpack565( bytes, codes );
+	int b = Unpack565( bytes + 2, codes + 4 );
+	
+	// generate the midpoints
+	for( int i = 0; i < 3; ++i )
+	{
+		int c = codes[i];
+		int d = codes[4 + i];
+
+		if( isDxt1 && a <= b )
+		{
+			codes[8 + i] = ( u8 )( ( c + d )/2 );
+			codes[12 + i] = 0;
+		}
+		else
+		{
+			codes[8 + i] = ( u8 )( ( 2*c + d )/3 );
+			codes[12 + i] = ( u8 )( ( c + 2*d )/3 );
+		}
+	}
+	
+	// fill in alpha for the intermediate values
+	codes[8 + 3] = 255;
+	codes[12 + 3] = ( isDxt1 && a <= b ) ? 0 : 255;
+	
+	// unpack the indices
+	u8 indices[16];
+	for( int i = 0; i < 4; ++i )
+	{
+		u8* ind = indices + 4*i;
+		u8 packed = bytes[4 + i];
+		
+		ind[0] = packed & 0x3;
+		ind[1] = ( packed >> 2 ) & 0x3;
+		ind[2] = ( packed >> 4 ) & 0x3;
+		ind[3] = ( packed >> 6 ) & 0x3;
+	}
+
+	// store out the colours
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 offset = 4*indices[i];
+		for( int j = 0; j < 4; ++j )
+			rgba[4*i + j] = codes[offset + j];
+	}
+}
+
+} // namespace squish
--- a/colourblock.h
+++ b/colourblock.h
@@ -0,0 +1,41 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURBLOCK_H
+#define SQUISH_COLOURBLOCK_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURBLOCK_H
--- a/colourfit.cpp
+++ b/colourfit.cpp
@@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourfit.h"
+#include "colourset.h"
+
+namespace squish {
+
+ColourFit::ColourFit( ColourSet const* colours, int flags ) 
+  : m_colours( colours ), 
+	m_flags( flags )
+{
+}
+
+ColourFit::~ColourFit()
+{
+}
+
+void ColourFit::Compress( void* block )
+{
+	bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
+	if( isDxt1 )
+	{
+		Compress3( block );
+		if( !m_colours->IsTransparent() )
+			Compress4( block );
+	}
+	else
+		Compress4( block );
+}
+
+} // namespace squish
--- a/colourfit.h
+++ b/colourfit.h
@@ -0,0 +1,56 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURFIT_H
+#define SQUISH_COLOURFIT_H
+
+#include "squish.h"
+#include "maths.h"
+
+#include <climits>
+
+namespace squish {
+
+class ColourSet;
+
+class ColourFit
+{
+public:
+	ColourFit( ColourSet const* colours, int flags );
+	virtual ~ColourFit();
+
+	void Compress( void* block );
+
+protected:
+	virtual void Compress3( void* block ) = 0;
+	virtual void Compress4( void* block ) = 0;
+
+	ColourSet const* m_colours;
+	int m_flags;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURFIT_H
--- a/colourset.cpp
+++ b/colourset.cpp
@@ -0,0 +1,121 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourset.h"
+
+namespace squish {
+
+ColourSet::ColourSet( u8 const* rgba, int mask, int flags )
+  : m_count( 0 ), 
+	m_transparent( false )
+{
+	// check the compression mode for dxt1
+	bool isDxt1 = ( ( flags & kDxt1 ) != 0 );
+	bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 );
+
+	// create the minimal set
+	for( int i = 0; i < 16; ++i )
+	{
+		// check this pixel is enabled
+		int bit = 1 << i;
+		if( ( mask & bit ) == 0 )
+		{
+			m_remap[i] = -1;
+			continue;
+		}
+	
+		// check for transparent pixels when using dxt1
+		if( isDxt1 && rgba[4*i + 3] < 128 )
+		{
+			m_remap[i] = -1;
+			m_transparent = true;
+			continue;
+		}
+
+		// loop over previous points for a match
+		for( int j = 0;; ++j )
+		{
+			// allocate a new point
+			if( j == i )
+			{
+				// normalise coordinates to [0,1]
+				float x = ( float )rgba[4*i] / 255.0f;
+				float y = ( float )rgba[4*i + 1] / 255.0f;
+				float z = ( float )rgba[4*i + 2] / 255.0f;
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// add the point
+				m_points[m_count] = Vec3( x, y, z );
+				m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = m_count;
+				
+				// advance
+				++m_count;
+				break;
+			}
+		
+			// check for a match
+			int oldbit = 1 << j;
+			bool match = ( ( mask & oldbit ) != 0 )
+				&& ( rgba[4*i] == rgba[4*j] )
+				&& ( rgba[4*i + 1] == rgba[4*j + 1] )
+				&& ( rgba[4*i + 2] == rgba[4*j + 2] )
+				&& ( rgba[4*j + 3] >= 128 || !isDxt1 );
+			if( match )
+			{
+				// get the index of the match
+				int index = m_remap[j];
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// map to this point and increase the weight
+				m_weights[index] += ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = index;
+				break;
+			}
+		}
+	}
+
+	// square root the weights
+	for( int i = 0; i < m_count; ++i )
+		m_weights[i] = std::sqrt( m_weights[i] );
+}
+
+void ColourSet::RemapIndices( u8 const* source, u8* target ) const
+{
+	for( int i = 0; i < 16; ++i )
+	{
+		int j = m_remap[i];
+		if( j == -1 )
+			target[i] = 3;
+		else
+			target[i] = source[j];
+	}
+}
+
+} // namespace squish
--- a/colourset.h
+++ b/colourset.h
@@ -0,0 +1,58 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURSET_H
+#define SQUISH_COLOURSET_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+/*! @brief Represents a set of block colours
+*/
+class ColourSet
+{
+public:
+	ColourSet( u8 const* rgba, int mask, int flags );
+
+	int GetCount() const { return m_count; }
+	Vec3 const* GetPoints() const { return m_points; }
+	float const* GetWeights() const { return m_weights; }
+	bool IsTransparent() const { return m_transparent; }
+
+	void RemapIndices( u8 const* source, u8* target ) const;
+
+private:
+	int m_count;
+	Vec3 m_points[16];
+	float m_weights[16];
+	int m_remap[16];
+	bool m_transparent;
+};
+
+} // namespace sqish
+
+#endif // ndef SQUISH_COLOURSET_H
--- a/config.h
+++ b/config.h
@@ -0,0 +1,49 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CONFIG_H
+#define SQUISH_CONFIG_H
+
+// Set to 1 when building squish to use Altivec instructions.
+#ifndef SQUISH_USE_ALTIVEC
+#define SQUISH_USE_ALTIVEC 0
+#endif
+
+// Set to 1 or 2 when building squish to use SSE or SSE2 instructions.
+#ifndef SQUISH_USE_SSE
+#define SQUISH_USE_SSE 0
+#endif
+
+// Internally set SQUISH_USE_SIMD when either Altivec or SSE is available.
+#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE
+#error "Cannot enable both Altivec and SSE!"
+#endif
+#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE
+#define SQUISH_USE_SIMD 1
+#else
+#define SQUISH_USE_SIMD 0
+#endif
+
+#endif // ndef SQUISH_CONFIG_H
--- a/maths.cpp
+++ b/maths.cpp
@@ -0,0 +1,259 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	The symmetric eigensystem solver algorithm is from 
+	http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf
+*/
+
+#include "maths.h"
+#include "simd.h"
+#include <cfloat>
+
+namespace squish {
+
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights )
+{
+	// compute the centroid
+	float total = 0.0f;
+	Vec3 centroid( 0.0f );
+	for( int i = 0; i < n; ++i )
+	{
+		total += weights[i];
+		centroid += weights[i]*points[i];
+	}
+	if( total > FLT_EPSILON )
+		centroid /= total;
+
+	// accumulate the covariance matrix
+	Sym3x3 covariance( 0.0f );
+	for( int i = 0; i < n; ++i )
+	{
+		Vec3 a = points[i] - centroid;
+		Vec3 b = weights[i]*a;
+		
+		covariance[0] += a.X()*b.X();
+		covariance[1] += a.X()*b.Y();
+		covariance[2] += a.X()*b.Z();
+		covariance[3] += a.Y()*b.Y();
+		covariance[4] += a.Y()*b.Z();
+		covariance[5] += a.Z()*b.Z();
+	}
+	
+	// return it
+	return covariance;
+}
+
+#if 0
+
+static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
+{
+	// compute M
+	Sym3x3 m;
+	m[0] = matrix[0] - evalue;
+	m[1] = matrix[1];
+	m[2] = matrix[2];
+	m[3] = matrix[3] - evalue;
+	m[4] = matrix[4];
+	m[5] = matrix[5] - evalue;
+
+	// compute U
+	Sym3x3 u;
+	u[0] = m[3]*m[5] - m[4]*m[4];
+	u[1] = m[2]*m[4] - m[1]*m[5];
+	u[2] = m[1]*m[4] - m[2]*m[3];
+	u[3] = m[0]*m[5] - m[2]*m[2];
+	u[4] = m[1]*m[2] - m[4]*m[0];
+	u[5] = m[0]*m[3] - m[1]*m[1];
+
+	// find the largest component
+	float mc = std::fabs( u[0] );
+	int mi = 0;
+	for( int i = 1; i < 6; ++i )
+	{
+		float c = std::fabs( u[i] );
+		if( c > mc )
+		{
+			mc = c;
+			mi = i;
+		}
+	}
+
+	// pick the column with this component
+	switch( mi )
+	{
+	case 0:
+		return Vec3( u[0], u[1], u[2] );
+
+	case 1:
+	case 3:
+		return Vec3( u[1], u[3], u[4] );
+
+	default:
+		return Vec3( u[2], u[4], u[5] );
+	}
+}
+
+static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue )
+{
+	// compute M
+	Sym3x3 m;
+	m[0] = matrix[0] - evalue;
+	m[1] = matrix[1];
+	m[2] = matrix[2];
+	m[3] = matrix[3] - evalue;
+	m[4] = matrix[4];
+	m[5] = matrix[5] - evalue;
+
+	// find the largest component
+	float mc = std::fabs( m[0] );
+	int mi = 0;
+	for( int i = 1; i < 6; ++i )
+	{
+		float c = std::fabs( m[i] );
+		if( c > mc )
+		{
+			mc = c;
+			mi = i;
+		}
+	}
+
+	// pick the first eigenvector based on this index
+	switch( mi )
+	{
+	case 0:
+	case 1:
+		return Vec3( -m[1], m[0], 0.0f );
+
+	case 2:
+		return Vec3( m[2], 0.0f, -m[0] );
+
+	case 3:
+	case 4:
+		return Vec3( 0.0f, -m[4], m[3] );
+
+	default:
+		return Vec3( 0.0f, -m[5], m[4] );
+	}
+}
+
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+	// compute the cubic coefficients
+	float c0 = matrix[0]*matrix[3]*matrix[5] 
+		+ 2.0f*matrix[1]*matrix[2]*matrix[4] 
+		- matrix[0]*matrix[4]*matrix[4] 
+		- matrix[3]*matrix[2]*matrix[2] 
+		- matrix[5]*matrix[1]*matrix[1];
+	float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5]
+		- matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4];
+	float c2 = matrix[0] + matrix[3] + matrix[5];
+
+	// compute the quadratic coefficients
+	float a = c1 - ( 1.0f/3.0f )*c2*c2;
+	float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0;
+
+	// compute the root count check
+	float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a;
+
+	// test the multiplicity
+	if( FLT_EPSILON < Q )
+	{
+		// only one root, which implies we have a multiple of the identity
+        return Vec3( 1.0f );
+	}
+	else if( Q < -FLT_EPSILON )
+	{
+		// three distinct roots
+		float theta = std::atan2( std::sqrt( -Q ), -0.5f*b );
+		float rho = std::sqrt( 0.25f*b*b - Q );
+
+		float rt = std::pow( rho, 1.0f/3.0f );
+		float ct = std::cos( theta/3.0f );
+		float st = std::sin( theta/3.0f );
+
+		float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct;
+		float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st );
+		float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st );
+
+		// pick the larger
+		if( std::fabs( l2 ) > std::fabs( l1 ) )
+			l1 = l2;
+		if( std::fabs( l3 ) > std::fabs( l1 ) )
+			l1 = l3;
+
+		// get the eigenvector
+		return GetMultiplicity1Evector( matrix, l1 );
+	}
+	else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON )
+	{
+		// two roots
+		float rt;
+		if( b < 0.0f )
+			rt = -std::pow( -0.5f*b, 1.0f/3.0f );
+		else
+			rt = std::pow( 0.5f*b, 1.0f/3.0f );
+		
+		float l1 = ( 1.0f/3.0f )*c2 + rt;		// repeated
+		float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt;
+		
+		// get the eigenvector
+		if( std::fabs( l1 ) > std::fabs( l2 ) )
+			return GetMultiplicity2Evector( matrix, l1 );
+		else
+			return GetMultiplicity1Evector( matrix, l2 );
+	}
+}
+
+#else
+
+#define POWER_ITERATION_COUNT 	8
+
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+	Vec4 const row0( matrix[0], matrix[1], matrix[2], 0.0f );
+	Vec4 const row1( matrix[1], matrix[3], matrix[4], 0.0f );
+	Vec4 const row2( matrix[2], matrix[4], matrix[5], 0.0f );
+	Vec4 v = VEC4_CONST( 1.0f );
+	for( int i = 0; i < POWER_ITERATION_COUNT; ++i )
+	{
+		// matrix multiply
+		Vec4 w = row0*v.SplatX();
+		w = MultiplyAdd(row1, v.SplatY(), w);
+		w = MultiplyAdd(row2, v.SplatZ(), w);
+
+		// get max component from xyz in all channels
+		Vec4 a = Max(w.SplatX(), Max(w.SplatY(), w.SplatZ()));
+
+		// divide through and advance
+		v = w*Reciprocal(a);
+	}
+	return v.GetVec3();
+}
+
+#endif
+
+} // namespace squish
--- a/maths.h
+++ b/maths.h
@@ -0,0 +1,233 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_MATHS_H
+#define SQUISH_MATHS_H
+
+#include <cmath>
+#include <algorithm>
+#include "config.h"
+
+namespace squish {
+
+class Vec3
+{
+public:
+	typedef Vec3 const& Arg;
+
+	Vec3()
+	{
+	}
+
+	explicit Vec3( float s )
+	{
+		m_x = s;
+		m_y = s;
+		m_z = s;
+	}
+
+	Vec3( float x, float y, float z )
+	{
+		m_x = x;
+		m_y = y;
+		m_z = z;
+	}
+	
+	float X() const { return m_x; }
+	float Y() const { return m_y; }
+	float Z() const { return m_z; }
+	
+	Vec3 operator-() const
+	{
+		return Vec3( -m_x, -m_y, -m_z );
+	}
+	
+	Vec3& operator+=( Arg v )
+	{
+		m_x += v.m_x;
+		m_y += v.m_y;
+		m_z += v.m_z;
+		return *this;
+	}
+	
+	Vec3& operator-=( Arg v )
+	{
+		m_x -= v.m_x;
+		m_y -= v.m_y;
+		m_z -= v.m_z;
+		return *this;
+	}
+	
+	Vec3& operator*=( Arg v )
+	{
+		m_x *= v.m_x;
+		m_y *= v.m_y;
+		m_z *= v.m_z;
+		return *this;
+	}
+	
+	Vec3& operator*=( float s )
+	{
+		m_x *= s;
+		m_y *= s;
+		m_z *= s;
+		return *this;
+	}
+	
+	Vec3& operator/=( Arg v )
+	{
+		m_x /= v.m_x;
+		m_y /= v.m_y;
+		m_z /= v.m_z;
+		return *this;
+	}
+	
+	Vec3& operator/=( float s )
+	{
+		float t = 1.0f/s;
+		m_x *= t;
+		m_y *= t;
+		m_z *= t;
+		return *this;
+	}
+	
+	friend Vec3 operator+( Arg left, Arg right )
+	{
+		Vec3 copy( left );
+		return copy += right;
+	}
+	
+	friend Vec3 operator-( Arg left, Arg right )
+	{
+		Vec3 copy( left );
+		return copy -= right;
+	}
+	
+	friend Vec3 operator*( Arg left, Arg right )
+	{
+		Vec3 copy( left );
+		return copy *= right;
+	}
+	
+	friend Vec3 operator*( Arg left, float right )
+	{
+		Vec3 copy( left );
+		return copy *= right;
+	}
+	
+	friend Vec3 operator*( float left, Arg right )
+	{
+		Vec3 copy( right );
+		return copy *= left;
+	}
+	
+	friend Vec3 operator/( Arg left, Arg right )
+	{
+		Vec3 copy( left );
+		return copy /= right;
+	}
+	
+	friend Vec3 operator/( Arg left, float right )
+	{
+		Vec3 copy( left );
+		return copy /= right;
+	}
+	
+	friend float Dot( Arg left, Arg right )
+	{
+		return left.m_x*right.m_x + left.m_y*right.m_y + left.m_z*right.m_z;
+	}
+	
+	friend Vec3 Min( Arg left, Arg right )
+	{
+		return Vec3(
+			std::min( left.m_x, right.m_x ), 
+			std::min( left.m_y, right.m_y ), 
+			std::min( left.m_z, right.m_z )
+		);
+	}
+
+	friend Vec3 Max( Arg left, Arg right )
+	{
+		return Vec3(
+			std::max( left.m_x, right.m_x ), 
+			std::max( left.m_y, right.m_y ), 
+			std::max( left.m_z, right.m_z )
+		);
+	}
+
+	friend Vec3 Truncate( Arg v )
+	{
+		return Vec3(
+			v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), 
+			v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), 
+			v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z )
+		);
+	}
+
+private:
+	float m_x;
+	float m_y;
+	float m_z;
+};
+
+inline float LengthSquared( Vec3::Arg v )
+{
+	return Dot( v, v );
+}
+
+class Sym3x3
+{
+public:
+	Sym3x3()
+	{
+	}
+
+	Sym3x3( float s )
+	{
+		for( int i = 0; i < 6; ++i )
+			m_x[i] = s;
+	}
+
+	float operator[]( int index ) const
+	{
+		return m_x[index];
+	}
+
+	float& operator[]( int index )
+	{
+		return m_x[index];
+	}
+
+private:
+	float m_x[6];
+};
+
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights );
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix );
+
+} // namespace squish
+
+#endif // ndef SQUISH_MATHS_H
--- a/rangefit.cpp
+++ b/rangefit.cpp
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "rangefit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+RangeFit::RangeFit( ColourSet const* colours, int flags, float* metric ) 
+  : ColourFit( colours, flags )
+{
+	// initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f)
+	if( metric )
+		m_metric = Vec3( metric[0], metric[1], metric[2] );
+	else
+		m_metric = Vec3( 1.0f );	
+
+	// initialise the best error
+	m_besterror = FLT_MAX;
+
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+	
+	// get the covariance matrix
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, weights );
+	
+	// compute the principle component
+	Vec3 principle = ComputePrincipleComponent( covariance );
+
+	// get the min and max range as the codebook endpoints
+	Vec3 start( 0.0f );
+	Vec3 end( 0.0f );
+	if( count > 0 )
+	{
+		float min, max;
+		
+		// compute the range
+		start = end = values[0];
+		min = max = Dot( values[0], principle );
+		for( int i = 1; i < count; ++i )
+		{
+			float val = Dot( values[i], principle );
+			if( val < min )
+			{
+				start = values[i];
+				min = val;
+			}
+			else if( val > max )
+			{
+				end = values[i];
+				max = val;
+			}
+		}
+	}
+			
+	// clamp the output to [0, 1]
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	start = Min( one, Max( zero, start ) );
+	end = Min( one, Max( zero, end ) );
+
+	// clamp to the grid and save
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+	Vec3 const half( 0.5f );
+	m_start = Truncate( grid*start + half )*gridrcp;
+	m_end = Truncate( grid*end + half )*gridrcp;
+}
+
+void RangeFit::Compress3( void* block )
+{
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// create a codebook
+	Vec3 codes[3];
+	codes[0] = m_start;
+	codes[1] = m_end;
+	codes[2] = 0.5f*m_start + 0.5f*m_end;
+
+	// match each point to the closest code
+	u8 closest[16];
+	float error = 0.0f;
+	for( int i = 0; i < count; ++i )
+	{
+		// find the closest code
+		float dist = FLT_MAX;
+		int idx = 0;
+		for( int j = 0; j < 3; ++j )
+		{
+			float d = LengthSquared( m_metric*( values[i] - codes[j] ) );
+			if( d < dist )
+			{
+				dist = d;
+				idx = j;
+			}
+		}
+		
+		// save the index
+		closest[i] = ( u8 )idx;
+		
+		// accumulate the error
+		error += dist;
+	}
+	
+	// save this scheme if it wins
+	if( error < m_besterror )
+	{
+		// remap the indices
+		u8 indices[16];
+		m_colours->RemapIndices( closest, indices );
+		
+		// save the block
+		WriteColourBlock3( m_start, m_end, indices, block );
+		
+		// save the error
+		m_besterror = error;
+	}
+}
+
+void RangeFit::Compress4( void* block )
+{
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// create a codebook
+	Vec3 codes[4];
+	codes[0] = m_start;
+	codes[1] = m_end;
+	codes[2] = ( 2.0f/3.0f )*m_start + ( 1.0f/3.0f )*m_end;
+	codes[3] = ( 1.0f/3.0f )*m_start + ( 2.0f/3.0f )*m_end;
+
+	// match each point to the closest code
+	u8 closest[16];
+	float error = 0.0f;
+	for( int i = 0; i < count; ++i )
+	{
+		// find the closest code
+		float dist = FLT_MAX;
+		int idx = 0;
+		for( int j = 0; j < 4; ++j )
+		{
+			float d = LengthSquared( m_metric*( values[i] - codes[j] ) );
+			if( d < dist )
+			{
+				dist = d;
+				idx = j;
+			}
+		}
+		
+		// save the index
+		closest[i] = ( u8 )idx;
+		
+		// accumulate the error
+		error += dist;
+	}
+	
+	// save this scheme if it wins
+	if( error < m_besterror )
+	{
+		// remap the indices
+		u8 indices[16];
+		m_colours->RemapIndices( closest, indices );
+		
+		// save the block
+		WriteColourBlock4( m_start, m_end, indices, block );
+
+		// save the error
+		m_besterror = error;
+	}
+}
+
+} // namespace squish
--- a/rangefit.h
+++ b/rangefit.h
@@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_RANGEFIT_H
+#define SQUISH_RANGEFIT_H
+
+#include "squish.h"
+#include "colourfit.h"
+#include "maths.h"
+
+namespace squish {
+
+class ColourSet;
+
+class RangeFit : public ColourFit
+{
+public:
+	RangeFit( ColourSet const* colours, int flags, float* metric );
+	
+private:
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+	
+	Vec3 m_metric;
+	Vec3 m_start;
+	Vec3 m_end;
+	float m_besterror;
+};
+
+} // squish
+
+#endif // ndef SQUISH_RANGEFIT_H
--- a/simd.h
+++ b/simd.h
@@ -0,0 +1,40 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_H
+#define SQUISH_SIMD_H
+
+#include "maths.h"
+
+#if SQUISH_USE_ALTIVEC
+#include "simd_ve.h"
+#elif SQUISH_USE_SSE
+#include "simd_sse.h"
+#else
+#include "simd_float.h"
+#endif
+
+
+#endif // ndef SQUISH_SIMD_H
--- a/simd_float.h
+++ b/simd_float.h
@@ -0,0 +1,183 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_FLOAT_H
+#define SQUISH_SIMD_FLOAT_H
+
+#include <algorithm>
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( X )
+
+class Vec4
+{
+public:
+	typedef Vec4 const& Arg;
+
+	Vec4() {}
+		
+	explicit Vec4( float s )
+	  : m_x( s ),
+		m_y( s ),
+		m_z( s ),
+		m_w( s )
+	{
+	}
+	
+	Vec4( float x, float y, float z, float w )
+	  : m_x( x ),
+		m_y( y ),
+		m_z( z ),
+		m_w( w )
+	{
+	}
+	
+	Vec3 GetVec3() const
+	{
+		return Vec3( m_x, m_y, m_z );
+	}
+	
+	Vec4 SplatX() const { return Vec4( m_x ); }
+	Vec4 SplatY() const { return Vec4( m_y ); }
+	Vec4 SplatZ() const { return Vec4( m_z ); }
+	Vec4 SplatW() const { return Vec4( m_w ); }
+
+	Vec4& operator+=( Arg v )
+	{
+		m_x += v.m_x;
+		m_y += v.m_y;
+		m_z += v.m_z;
+		m_w += v.m_w;
+		return *this;
+	}
+	
+	Vec4& operator-=( Arg v )
+	{
+		m_x -= v.m_x;
+		m_y -= v.m_y;
+		m_z -= v.m_z;
+		m_w -= v.m_w;
+		return *this;
+	}
+	
+	Vec4& operator*=( Arg v )
+	{
+		m_x *= v.m_x;
+		m_y *= v.m_y;
+		m_z *= v.m_z;
+		m_w *= v.m_w;
+		return *this;
+	}
+	
+	friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right  )
+	{
+		Vec4 copy( left );
+		return copy += right;
+	}
+	
+	friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right  )
+	{
+		Vec4 copy( left );
+		return copy -= right;
+	}
+	
+	friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
+	{
+		Vec4 copy( left );
+		return copy *= right;
+	}
+	
+	//! Returns a*b + c
+	friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return a*b + c;
+	}
+	
+	//! Returns -( a*b - c )
+	friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return c - a*b;
+	}
+	
+	friend Vec4 Reciprocal( Vec4::Arg v )
+	{
+		return Vec4( 
+			1.0f/v.m_x, 
+			1.0f/v.m_y, 
+			1.0f/v.m_z, 
+			1.0f/v.m_w 
+		);
+	}
+	
+	friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( 
+			std::min( left.m_x, right.m_x ), 
+			std::min( left.m_y, right.m_y ), 
+			std::min( left.m_z, right.m_z ), 
+			std::min( left.m_w, right.m_w ) 
+		);
+	}
+	
+	friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( 
+			std::max( left.m_x, right.m_x ), 
+			std::max( left.m_y, right.m_y ), 
+			std::max( left.m_z, right.m_z ), 
+			std::max( left.m_w, right.m_w ) 
+		);
+	}
+	
+	friend Vec4 Truncate( Vec4::Arg v )
+	{
+		return Vec4(
+			v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), 
+			v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), 
+			v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z ),
+			v.m_w > 0.0f ? std::floor( v.m_w ) : std::ceil( v.m_w )
+		);
+	}
+	
+	friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) 
+	{
+		return left.m_x < right.m_x
+			|| left.m_y < right.m_y
+			|| left.m_z < right.m_z
+			|| left.m_w < right.m_w;
+	}
+	
+private:
+	float m_x;
+	float m_y;
+	float m_z;
+	float m_w;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SIMD_FLOAT_H
+
--- a/simd_sse.h
+++ b/simd_sse.h
@@ -0,0 +1,180 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_SSE_H
+#define SQUISH_SIMD_SSE_H
+
+#include <xmmintrin.h>
+#if ( SQUISH_USE_SSE > 1 )
+#include <emmintrin.h>
+#endif
+
+#define SQUISH_SSE_SPLAT( a )										\
+	( ( a ) | ( ( a ) << 2 ) | ( ( a ) << 4 ) | ( ( a ) << 6 ) )
+
+#define SQUISH_SSE_SHUF( x, y, z, w )								\
+	( ( x ) | ( ( y ) << 2 ) | ( ( z ) << 4 ) | ( ( w ) << 6 ) )
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( X )
+
+class Vec4
+{
+public:
+	typedef Vec4 const& Arg;
+
+	Vec4() {}
+		
+	explicit Vec4( __m128 v ) : m_v( v ) {}
+	
+	Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}
+	
+	Vec4& operator=( Vec4 const& arg )
+	{
+		m_v = arg.m_v;
+		return *this;
+	}
+	
+	explicit Vec4( float s ) : m_v( _mm_set1_ps( s ) ) {}
+	
+	Vec4( float x, float y, float z, float w ) : m_v( _mm_setr_ps( x, y, z, w ) ) {}
+	
+	Vec3 GetVec3() const
+	{
+#ifdef __GNUC__
+		__attribute__ ((__aligned__ (16))) float c[4];
+#else
+		__declspec(align(16)) float c[4];
+#endif
+		_mm_store_ps( c, m_v );
+		return Vec3( c[0], c[1], c[2] );
+	}
+	
+	Vec4 SplatX() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) ) ); }
+	Vec4 SplatY() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) ) ); }
+	Vec4 SplatZ() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 2 ) ) ); }
+	Vec4 SplatW() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 3 ) ) ); }
+
+	Vec4& operator+=( Arg v )
+	{
+		m_v = _mm_add_ps( m_v, v.m_v );
+		return *this;
+	}
+	
+	Vec4& operator-=( Arg v )
+	{
+		m_v = _mm_sub_ps( m_v, v.m_v );
+		return *this;
+	}
+	
+	Vec4& operator*=( Arg v )
+	{
+		m_v = _mm_mul_ps( m_v, v.m_v );
+		return *this;
+	}
+	
+	friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( _mm_add_ps( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( _mm_sub_ps( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( _mm_mul_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Returns a*b + c
+	friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( _mm_add_ps( _mm_mul_ps( a.m_v, b.m_v ), c.m_v ) );
+	}
+	
+	//! Returns -( a*b - c )
+	friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( _mm_sub_ps( c.m_v, _mm_mul_ps( a.m_v, b.m_v ) ) );
+	}
+	
+	friend Vec4 Reciprocal( Vec4::Arg v )
+	{
+		// get the reciprocal estimate
+		__m128 estimate = _mm_rcp_ps( v.m_v );
+
+		// one round of Newton-Rhaphson refinement
+		__m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.m_v ) );
+		return Vec4( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
+	}
+	
+	friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( _mm_min_ps( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( _mm_max_ps( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 Truncate( Vec4::Arg v )
+	{
+#if ( SQUISH_USE_SSE == 1 )
+		// convert to ints
+		__m128 input = v.m_v;
+		__m64 lo = _mm_cvttps_pi32( input );
+		__m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) );
+
+		// convert to floats
+		__m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) );
+		__m128 truncated = _mm_cvtpi32_ps( part, lo );
+		
+		// clear out the MMX multimedia state to allow FP calls later
+		_mm_empty(); 
+		return Vec4( truncated );
+#else
+		// use SSE2 instructions
+		return Vec4( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.m_v ) ) );
+#endif
+	}
+	
+	friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) 
+	{
+		__m128 bits = _mm_cmplt_ps( left.m_v, right.m_v );
+		int value = _mm_movemask_ps( bits );
+		return value != 0;
+	}
+	
+private:
+	__m128 m_v;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SIMD_SSE_H
--- a/simd_ve.h
+++ b/simd_ve.h
@@ -0,0 +1,166 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_VE_H
+#define SQUISH_SIMD_VE_H
+
+#include <altivec.h>
+#undef bool
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( ( vector float ){ X } )
+
+class Vec4
+{
+public:
+	typedef Vec4 Arg;
+
+	Vec4() {}
+		
+	explicit Vec4( vector float v ) : m_v( v ) {}
+	
+	Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}
+	
+	Vec4& operator=( Vec4 const& arg )
+	{
+		m_v = arg.m_v;
+		return *this;
+	}
+	
+	explicit Vec4( float s )
+	{
+		union { vector float v; float c[4]; } u;
+		u.c[0] = s;
+		u.c[1] = s;
+		u.c[2] = s;
+		u.c[3] = s;
+		m_v = u.v;
+	}
+	
+	Vec4( float x, float y, float z, float w )
+	{
+		union { vector float v; float c[4]; } u;
+		u.c[0] = x;
+		u.c[1] = y;
+		u.c[2] = z;
+		u.c[3] = w;
+		m_v = u.v;
+	}
+	
+	Vec3 GetVec3() const
+	{
+		union { vector float v; float c[4]; } u;
+		u.v = m_v;
+		return Vec3( u.c[0], u.c[1], u.c[2] );
+	}
+	
+	Vec4 SplatX() const { return Vec4( vec_splat( m_v, 0 ) ); }
+	Vec4 SplatY() const { return Vec4( vec_splat( m_v, 1 ) ); }
+	Vec4 SplatZ() const { return Vec4( vec_splat( m_v, 2 ) ); }
+	Vec4 SplatW() const { return Vec4( vec_splat( m_v, 3 ) ); }
+
+	Vec4& operator+=( Arg v )
+	{
+		m_v = vec_add( m_v, v.m_v );
+		return *this;
+	}
+	
+	Vec4& operator-=( Arg v )
+	{
+		m_v = vec_sub( m_v, v.m_v );
+		return *this;
+	}
+	
+	Vec4& operator*=( Arg v )
+	{
+		m_v = vec_madd( m_v, v.m_v, ( vector float ){ -0.0f } );
+		return *this;
+	}
+	
+	friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( vec_add( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( vec_sub( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( vec_madd( left.m_v, right.m_v, ( vector float ){ -0.0f } ) );
+	}
+	
+	//! Returns a*b + c
+	friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( vec_madd( a.m_v, b.m_v, c.m_v ) );
+	}
+	
+	//! Returns -( a*b - c )
+	friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( vec_nmsub( a.m_v, b.m_v, c.m_v ) );
+	}
+	
+	friend Vec4 Reciprocal( Vec4::Arg v )
+	{
+		// get the reciprocal estimate
+		vector float estimate = vec_re( v.m_v );
+		
+		// one round of Newton-Rhaphson refinement
+		vector float diff = vec_nmsub( estimate, v.m_v, ( vector float ){ 1.0f } );
+		return Vec4( vec_madd( diff, estimate, estimate ) );
+	}
+	
+	friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( vec_min( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( vec_max( left.m_v, right.m_v ) );
+	}
+	
+	friend Vec4 Truncate( Vec4::Arg v )
+	{
+		return Vec4( vec_trunc( v.m_v ) );
+	}
+	
+	friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) 
+	{
+		return vec_any_lt( left.m_v, right.m_v ) != 0;
+	}
+	
+private:
+	vector float m_v;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SIMD_VE_H
--- a/singlecolourfit.cpp
+++ b/singlecolourfit.cpp
@@ -0,0 +1,172 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "singlecolourfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+
+namespace squish {
+
+struct SourceBlock
+{
+	u8 start;
+	u8 end;
+	u8 error;
+};
+
+struct SingleColourLookup
+{
+	SourceBlock sources[2];
+};
+
+#include "singlecolourlookup.inl"
+
+static int FloatToInt( float a, int limit )
+{
+	// use ANSI round-to-zero behaviour to get round-to-nearest
+	int i = ( int )( a + 0.5f );
+
+	// clamp to the limit
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit; 
+
+	// done
+	return i;
+}
+
+SingleColourFit::SingleColourFit( ColourSet const* colours, int flags )
+  : ColourFit( colours, flags )
+{
+	// grab the single colour
+	Vec3 const* values = m_colours->GetPoints();
+	m_colour[0] = ( u8 )FloatToInt( 255.0f*values->X(), 255 );
+	m_colour[1] = ( u8 )FloatToInt( 255.0f*values->Y(), 255 );
+	m_colour[2] = ( u8 )FloatToInt( 255.0f*values->Z(), 255 );
+		
+	// initialise the best error
+	m_besterror = INT_MAX;
+}
+
+void SingleColourFit::Compress3( void* block )
+{
+	// build the table of lookups
+	SingleColourLookup const* const lookups[] = 
+	{
+		lookup_5_3, 
+		lookup_6_3, 
+		lookup_5_3
+	};
+	
+	// find the best end-points and index
+	ComputeEndPoints( lookups );
+	
+	// build the block if we win
+	if( m_error < m_besterror )
+	{
+		// remap the indices
+		u8 indices[16];
+		m_colours->RemapIndices( &m_index, indices );
+		
+		// save the block
+		WriteColourBlock3( m_start, m_end, indices, block );
+
+		// save the error
+		m_besterror = m_error;
+	}
+}
+
+void SingleColourFit::Compress4( void* block )
+{
+	// build the table of lookups
+	SingleColourLookup const* const lookups[] = 
+	{
+		lookup_5_4, 
+		lookup_6_4, 
+		lookup_5_4
+	};
+	
+	// find the best end-points and index
+	ComputeEndPoints( lookups );
+	
+	// build the block if we win
+	if( m_error < m_besterror )
+	{
+		// remap the indices
+		u8 indices[16];
+		m_colours->RemapIndices( &m_index, indices );
+		
+		// save the block
+		WriteColourBlock4( m_start, m_end, indices, block );
+
+		// save the error
+		m_besterror = m_error;
+	}
+}
+
+void SingleColourFit::ComputeEndPoints( SingleColourLookup const* const* lookups )
+{
+	// check each index combination (endpoint or intermediate)
+	m_error = INT_MAX;
+	for( int index = 0; index < 2; ++index )
+	{
+		// check the error for this codebook index
+		SourceBlock const* sources[3];
+		int error = 0;
+		for( int channel = 0; channel < 3; ++channel )
+		{
+			// grab the lookup table and index for this channel
+			SingleColourLookup const* lookup = lookups[channel];
+			int target = m_colour[channel];
+			
+			// store a pointer to the source for this channel
+			sources[channel] = lookup[target].sources + index;
+			
+			// accumulate the error
+			int diff = sources[channel]->error;
+			error += diff*diff;			
+		}
+		
+		// keep it if the error is lower
+		if( error < m_error )
+		{
+			m_start = Vec3(
+				( float )sources[0]->start/31.0f, 
+				( float )sources[1]->start/63.0f, 
+				( float )sources[2]->start/31.0f
+			);
+			m_end = Vec3(
+				( float )sources[0]->end/31.0f, 
+				( float )sources[1]->end/63.0f, 
+				( float )sources[2]->end/31.0f
+			);
+			m_index = ( u8 )( 2*index );
+			m_error = error;
+		}
+	}
+}
+
+} // namespace squish
--- a/singlecolourfit.h
+++ b/singlecolourfit.h
@@ -0,0 +1,58 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SINGLECOLOURFIT_H
+#define SQUISH_SINGLECOLOURFIT_H
+
+#include "squish.h"
+#include "colourfit.h"
+
+namespace squish {
+
+class ColourSet;
+struct SingleColourLookup;
+
+class SingleColourFit : public ColourFit
+{
+public:
+	SingleColourFit( ColourSet const* colours, int flags );
+	
+private:
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+	
+	void ComputeEndPoints( SingleColourLookup const* const* lookups );
+	
+	u8 m_colour[3];
+	Vec3 m_start;
+	Vec3 m_end;
+	u8 m_index;
+	int m_error;
+	int m_besterror;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SINGLECOLOURFIT_H
--- a/squish.cpp
+++ b/squish.cpp
@@ -0,0 +1,230 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include <squish.h>
+#include "colourset.h"
+#include "maths.h"
+#include "rangefit.h"
+#include "clusterfit.h"
+#include "colourblock.h"
+#include "alpha.h"
+#include "singlecolourfit.h"
+
+namespace squish {
+
+static int FixFlags( int flags )
+{
+	// grab the flag bits
+	int method = flags & ( kDxt1 | kDxt3 | kDxt5 );
+	int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit );
+	int extra = flags & kWeightColourByAlpha;
+	
+	// set defaults
+	if( method != kDxt3 && method != kDxt5 )
+		method = kDxt1;
+	if( fit != kColourRangeFit && fit != kColourIterativeClusterFit )
+		fit = kColourClusterFit;
+		
+	// done
+	return method | fit | extra;
+}
+
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// get the block locations
+	void* colourBlock = block;
+	void* alphaBock = block;
+	if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
+		colourBlock = reinterpret_cast< u8* >( block ) + 8;
+
+	// create the minimal point set
+	ColourSet colours( rgba, mask, flags );
+	
+	// check the compression type and compress colour
+	if( colours.GetCount() == 1 )
+	{
+		// always do a single colour fit
+		SingleColourFit fit( &colours, flags );
+		fit.Compress( colourBlock );
+	}
+	else if( ( flags & kColourRangeFit ) != 0 || colours.GetCount() == 0 )
+	{
+		// do a range fit
+		RangeFit fit( &colours, flags, metric );
+		fit.Compress( colourBlock );
+	}
+	else
+	{
+		// default to a cluster fit (could be iterative or not)
+		ClusterFit fit( &colours, flags, metric );
+		fit.Compress( colourBlock );
+	}
+	
+	// compress alpha separately if necessary
+	if( ( flags & kDxt3 ) != 0 )
+		CompressAlphaDxt3( rgba, mask, alphaBock );
+	else if( ( flags & kDxt5 ) != 0 )
+		CompressAlphaDxt5( rgba, mask, alphaBock );
+}
+
+void Decompress( u8* rgba, void const* block, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// get the block locations
+	void const* colourBlock = block;
+	void const* alphaBock = block;
+	if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
+		colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
+
+	// decompress colour
+	DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+
+	// decompress alpha separately if necessary
+	if( ( flags & kDxt3 ) != 0 )
+		DecompressAlphaDxt3( rgba, alphaBock );
+	else if( ( flags & kDxt5 ) != 0 )
+		DecompressAlphaDxt5( rgba, alphaBock );
+}
+
+int GetStorageRequirements( int width, int height, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+	
+	// compute the storage requirements
+	int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 );
+	int blocksize = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	return blockcount*blocksize;	
+}
+
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// initialise the block output
+	u8* targetBlock = reinterpret_cast< u8* >( blocks );
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+
+	// loop over blocks
+	for( int y = 0; y < height; y += 4 )
+	{
+		for( int x = 0; x < width; x += 4 )
+		{
+			// build the 4x4 block of pixels
+			u8 sourceRgba[16*4];
+			u8* targetPixel = sourceRgba;
+			int mask = 0;
+			for( int py = 0; py < 4; ++py )
+			{
+				for( int px = 0; px < 4; ++px )
+				{
+					// get the source pixel in the image
+					int sx = x + px;
+					int sy = y + py;
+					
+					// enable if we're in the image
+					if( sx < width && sy < height )
+					{
+						// copy the rgba value
+						u8 const* sourcePixel = rgba + 4*( width*sy + sx );
+						for( int i = 0; i < 4; ++i )
+							*targetPixel++ = *sourcePixel++;
+							
+						// enable this pixel
+						mask |= ( 1 << ( 4*py + px ) );
+					}
+					else
+					{
+						// skip this pixel as its outside the image
+						targetPixel += 4;
+					}
+				}
+			}
+			
+			// compress it into the output
+			CompressMasked( sourceRgba, mask, targetBlock, flags, metric );
+			
+			// advance
+			targetBlock += bytesPerBlock;
+		}
+	}
+}
+
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// initialise the block input
+	u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks );
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+
+	// loop over blocks
+	for( int y = 0; y < height; y += 4 )
+	{
+		for( int x = 0; x < width; x += 4 )
+		{
+			// decompress the block
+			u8 targetRgba[4*16];
+			Decompress( targetRgba, sourceBlock, flags );
+			
+			// write the decompressed pixels to the correct image locations
+			u8 const* sourcePixel = targetRgba;
+			for( int py = 0; py < 4; ++py )
+			{
+				for( int px = 0; px < 4; ++px )
+				{
+					// get the target location
+					int sx = x + px;
+					int sy = y + py;
+					if( sx < width && sy < height )
+					{
+						u8* targetPixel = rgba + 4*( width*sy + sx );
+						
+						// copy the rgba value
+						for( int i = 0; i < 4; ++i )
+							*targetPixel++ = *sourcePixel++;
+					}
+					else
+					{
+						// skip this pixel as its outside the image
+						sourcePixel += 4;
+					}
+				}
+			}
+			
+			// advance
+			sourceBlock += bytesPerBlock;
+		}
+	}
+}
+
+} // namespace squish
--- a/squish.h
+++ b/squish.h
@@ -0,0 +1,263 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_H
+#define SQUISH_H
+
+//! All squish API functions live in this namespace.
+namespace squish {
+
+// -----------------------------------------------------------------------------
+
+//! Typedef a quantity that is a single unsigned byte.
+typedef unsigned char u8;
+
+// -----------------------------------------------------------------------------
+
+enum
+{
+	//! Use DXT1 compression.
+	kDxt1 = ( 1 << 0 ), 
+	
+	//! Use DXT3 compression.
+	kDxt3 = ( 1 << 1 ), 
+	
+	//! Use DXT5 compression.
+	kDxt5 = ( 1 << 2 ), 
+	
+	//! Use a very slow but very high quality colour compressor.
+	kColourIterativeClusterFit = ( 1 << 8 ),	
+	
+	//! Use a slow but high quality colour compressor (the default).
+	kColourClusterFit = ( 1 << 3 ),	
+	
+	//! Use a fast but low quality colour compressor.
+	kColourRangeFit	= ( 1 << 4 ),
+	
+	//! Weight the colour by alpha during cluster fit (disabled by default).
+	kWeightColourByAlpha = ( 1 << 7 )
+};
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+	@param rgba		The rgba values of the 16 source pixels.
+	@param mask		The valid pixel mask.
+	@param block	Storage for the compressed DXT block.
+	@param flags	Compression flags.
+	@param metric	An optional perceptual metric.
+	
+	The source pixels should be presented as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+		
+	The mask parameter enables only certain pixels within the block. The lowest
+	bit enables the first pixel and so on up to the 16th bit. Bits beyond the
+	16th bit are ignored. Pixels that are not enabled are allowed to take
+	arbitrary colours in the output block. An example of how this can be used
+	is in the CompressImage function to disable pixels outside the bounds of
+	the image when the width or height is not divisible by 4.
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. When using DXT1 
+	compression, 8 bytes of storage are required for the compressed DXT block. 
+	DXT3 and DXT5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor to use 
+	when fitting the RGB components of the data. Possible colour compressors 
+	are: kColourClusterFit (the default), kColourRangeFit (very fast, low 
+	quality) or kColourIterativeClusterFit (slowest, best quality).
+		
+	When using kColourClusterFit or kColourIterativeClusterFit, an additional 
+	flag can be specified to weight the importance of each pixel by its alpha 
+	value. For images that are rendered using alpha blending, this can 
+	significantly increase the perceived quality.
+	
+	The metric parameter can be used to weight the relative importance of each
+	colour channel, or pass NULL to use the default uniform weight of 
+	{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that 
+	allowed either uniform or "perceptual" weights with the fixed values
+	{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a 
+	contiguous array of 3 floats.
+*/
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+	@param rgba		The rgba values of the 16 source pixels.
+	@param block	Storage for the compressed DXT block.
+	@param flags	Compression flags.
+	@param metric	An optional perceptual metric.
+	
+	The source pixels should be presented as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. When using DXT1 
+	compression, 8 bytes of storage are required for the compressed DXT block. 
+	DXT3 and DXT5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor to use 
+	when fitting the RGB components of the data. Possible colour compressors 
+	are: kColourClusterFit (the default), kColourRangeFit (very fast, low 
+	quality) or kColourIterativeClusterFit (slowest, best quality).
+		
+	When using kColourClusterFit or kColourIterativeClusterFit, an additional 
+	flag can be specified to weight the importance of each pixel by its alpha 
+	value. For images that are rendered using alpha blending, this can 
+	significantly increase the perceived quality.
+	
+	The metric parameter can be used to weight the relative importance of each
+	colour channel, or pass NULL to use the default uniform weight of 
+	{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that 
+	allowed either uniform or "perceptual" weights with the fixed values
+	{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a 
+	contiguous array of 3 floats.
+	
+	This method is an inline that calls CompressMasked with a mask of 0xffff, 
+	provided for compatibility with older versions of squish.
+*/
+inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 )
+{
+	CompressMasked( rgba, 0xffff, block, flags, metric );
+}
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses a 4x4 block of pixels.
+
+	@param rgba		Storage for the 16 decompressed pixels.
+	@param block	The compressed DXT block.
+	@param flags	Compression flags.
+
+	The decompressed pixels will be written as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this is:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+*/
+void Decompress( u8* rgba, void const* block, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Computes the amount of compressed storage required.
+
+	@param width	The width of the image.
+	@param height	The height of the image.
+	@param flags	Compression flags.
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+	
+	Most DXT images will be a multiple of 4 in each dimension, but this 
+	function supports arbitrary size images by allowing the outer blocks to
+	be only partially used.
+*/
+int GetStorageRequirements( int width, int height, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses an image in memory.
+
+	@param rgba		The pixels of the source.
+	@param width	The width of the source image.
+	@param height	The height of the source image.
+	@param blocks	Storage for the compressed output.
+	@param flags	Compression flags.
+	@param metric	An optional perceptual metric.
+	
+	The source pixels should be presented as a contiguous array of width*height
+	rgba values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+		
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. When using DXT1 
+	compression, 8 bytes of storage are required for each compressed DXT block. 
+	DXT3 and DXT5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor to use 
+	when fitting the RGB components of the data. Possible colour compressors 
+	are: kColourClusterFit (the default), kColourRangeFit (very fast, low 
+	quality) or kColourIterativeClusterFit (slowest, best quality).
+		
+	When using kColourClusterFit or kColourIterativeClusterFit, an additional 
+	flag can be specified to weight the importance of each pixel by its alpha 
+	value. For images that are rendered using alpha blending, this can 
+	significantly increase the perceived quality.
+	
+	The metric parameter can be used to weight the relative importance of each
+	colour channel, or pass NULL to use the default uniform weight of 
+	{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that 
+	allowed either uniform or "perceptual" weights with the fixed values
+	{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a 
+	contiguous array of 3 floats.
+	
+	Internally this function calls squish::CompressMasked for each block, which 
+	allows for pixels outside the image to take arbitrary values. The function 
+	squish::GetStorageRequirements can be called to compute the amount of memory
+	to allocate for the compressed output.
+*/
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses an image in memory.
+
+	@param rgba		Storage for the decompressed pixels.
+	@param width	The width of the source image.
+	@param height	The height of the source image.
+	@param blocks	The compressed DXT blocks.
+	@param flags	Compression flags.
+	
+	The decompressed pixels will be written as a contiguous array of width*height
+	16 rgba values, with each component as 1 byte each. In memory this is:
+	
+		{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+		
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+
+	Internally this function calls squish::Decompress for each block.
+*/
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags );
+
+// -----------------------------------------------------------------------------
+
+} // namespace squish
+
+#endif // ndef SQUISH_H
+