From 592d3b883158cdcd03fb3d255c81f276a62beb8f Mon Sep 17 00:00:00 2001
From: Jack Andersen
Date: Sat, 4 Apr 2015 13:19:51 -1000
Subject: [PATCH] Lightweight Commit of libsquish

---
 CMakeLists.txt                   | 104 ++++++++
 CMakeModules/FindlibSquish.cmake |  14 ++
 ChangeLog                        |  52 ++++
 LICENSE                          |  20 ++
 README                           |  35 +++
 alpha.cpp                        | 350 +++++++++++++++++++++++++++
 alpha.h                          |  41 ++++
 clusterfit.cpp                   | 392 +++++++++++++++++++++++++++++++
 clusterfit.h                     |  61 +++++
 colourblock.cpp                  | 214 +++++++++++++++++
 colourblock.h                    |  41 ++++
 colourfit.cpp                    |  54 +++++
 colourfit.h                      |  56 +++++
 colourset.cpp                    | 121 ++++++++++
 colourset.h                      |  58 +++++
 config.h                         |  49 ++++
 maths.cpp                        | 259 ++++++++++++++++++++
 maths.h                          | 233 ++++++++++++++++++
 rangefit.cpp                     | 201 ++++++++++++++++
 rangefit.h                       |  54 +++++
 simd.h                           |  40 ++++
 simd_float.h                     | 183 +++++++++++++++
 simd_sse.h                       | 180 ++++++++++++++
 simd_ve.h                        | 166 +++++++++++++
 singlecolourfit.cpp              | 172 ++++++++++++++
 singlecolourfit.h                |  58 +++++
 squish.cpp                       | 230 ++++++++++++++++++
 squish.h                         | 263 +++++++++++++++++++++
 28 files changed, 3701 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 CMakeModules/FindlibSquish.cmake
 create mode 100644 ChangeLog
 create mode 100644 LICENSE
 create mode 100644 README
 create mode 100644 alpha.cpp
 create mode 100644 alpha.h
 create mode 100644 clusterfit.cpp
 create mode 100644 clusterfit.h
 create mode 100644 colourblock.cpp
 create mode 100644 colourblock.h
 create mode 100644 colourfit.cpp
 create mode 100644 colourfit.h
 create mode 100644 colourset.cpp
 create mode 100644 colourset.h
 create mode 100644 config.h
 create mode 100644 maths.cpp
 create mode 100644 maths.h
 create mode 100644 rangefit.cpp
 create mode 100644 rangefit.h
 create mode 100644 simd.h
 create mode 100644 simd_float.h
 create mode 100644 simd_sse.h
 create mode 100644 simd_ve.h
 create mode 100644 singlecolourfit.cpp
 create mode 100644 singlecolourfit.h
 create mode 100644 squish.cpp
 create mode 100644 squish.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..61f621b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,104 @@
+# cmake build file for squish
+# by Stefan Roettger (stefan@stereofx.org)
+# updated by Simon Brown (si@sjbrown.co.uk)
+
+# features:
+# Xcode: builds universal binaries, uses SSE2 on i386 and Altivec on ppc
+# Unix and VS: SSE2 support is enabled by default
+# use BUILD_SQUISH_WITH_SSE2 and BUILD_SQUISH_WITH_ALTIVEC to override
+
+PROJECT(squish)
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.3)
+
+OPTION(BUILD_SQUISH_WITH_SSE2 "Build with SSE2." ON)
+OPTION(BUILD_SQUISH_WITH_ALTIVEC "Build with Altivec." OFF)
+
+OPTION(BUILD_SHARED_LIBS "Build shared libraries." OFF)
+
+OPTION(BUILD_SQUISH_EXTRA "Build extra source code." OFF)
+
+IF (CMAKE_GENERATOR STREQUAL "Xcode")
+  SET(CMAKE_OSX_ARCHITECTURES "i386;ppc")
+ELSE (CMAKE_GENERATOR STREQUAL "Xcode")
+  IF (BUILD_SQUISH_WITH_SSE2 AND NOT WIN32)
+    ADD_DEFINITIONS(-DSQUISH_USE_SSE=2 -msse2)
+  ENDIF (BUILD_SQUISH_WITH_SSE2 AND NOT WIN32)
+  IF (BUILD_SQUISH_WITH_ALTIVEC AND NOT WIN32)
+    ADD_DEFINITIONS(-DSQUISH_USE_ALTIVEC=1 -maltivec)
+  ENDIF (BUILD_SQUISH_WITH_ALTIVEC AND NOT WIN32)
+ENDIF (CMAKE_GENERATOR STREQUAL "Xcode")
+
+SET(SQUISH_HDRS
+  squish.h
+  )
+
+SET(SQUISH_SRCS
+  alpha.cpp
+  alpha.h
+  clusterfit.cpp
+  clusterfit.h
+  colourblock.cpp
+  colourblock.h
+  colourfit.cpp
+  colourfit.h
+  colourset.cpp
+  colourset.h
+  maths.cpp
+  maths.h
+  rangefit.cpp
+  rangefit.h
+  simd.h
+  simd_float.h
+  simd_sse.h
+  simd_ve.h
+  singlecolourfit.cpp
+  singlecolourfit.h
+  singlecolourlookup.inl
+  squish.cpp
+  )
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+ADD_LIBRARY(squish ${SQUISH_SRCS} ${SQUISH_HDRS})
+
+SET_TARGET_PROPERTIES(
+  squish PROPERTIES
+  PUBLIC_HEADER "${SQUISH_HDRS}"
+  VERSION 0.0
+  SOVERSION 0.0
+  DEBUG_POSTFIX "d"
+  XCODE_ATTRIBUTE_GCC_PREPROCESSOR_DEFINITIONS "$(SQUISH_CPP_$(CURRENT_ARCH))"
+  XCODE_ATTRIBUTE_OTHER_CFLAGS "$(SQUISH_CFLAGS_$(CURRENT_ARCH))"
+  XCODE_ATTRIBUTE_SQUISH_CPP_i386 "SQUISH_USE_SSE=2"
+  XCODE_ATTRIBUTE_SQUISH_CFLAGS_i386 ""
+  XCODE_ATTRIBUTE_SQUISH_CPP_ppc "SQUISH_USE_ALTIVEC=1"
+  XCODE_ATTRIBUTE_SQUISH_CFLAGS_ppc "-maltivec"
+  )
+
+IF (BUILD_SQUISH_EXTRA)
+  SET(SQUISHTEST_SRCS extra/squishtest.cpp)
+
+  ADD_EXECUTABLE(squishtest ${SQUISHTEST_SRCS})
+  SET_TARGET_PROPERTIES(squishtest PROPERTIES DEBUG_POSTFIX "d")
+  TARGET_LINK_LIBRARIES(squishtest squish)
+
+  SET(SQUISHPNG_SRCS extra/squishpng.cpp)
+
+  FIND_PACKAGE(PNG)
+
+  IF (PNG_FOUND)
+    SET(CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES)
+    INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+    ADD_EXECUTABLE(squishpng ${SQUISHPNG_SRCS})
+    SET_TARGET_PROPERTIES(squishpng PROPERTIES DEBUG_POSTFIX "d")
+    TARGET_LINK_LIBRARIES(squishpng squish ${PNG_LIBRARIES})
+  ENDIF (PNG_FOUND)
+ENDIF (BUILD_SQUISH_EXTRA)
+
+INSTALL(
+  TARGETS squish
+  LIBRARY DESTINATION lib
+  ARCHIVE DESTINATION lib
+  PUBLIC_HEADER DESTINATION include
+  )
diff --git a/CMakeModules/FindlibSquish.cmake b/CMakeModules/FindlibSquish.cmake
new file mode 100644
index 0000000..a8d7cfe
--- /dev/null
+++ b/CMakeModules/FindlibSquish.cmake
@@ -0,0 +1,14 @@
+# Defines
+# LIBSQUISH_FOUND
+# LIBSQUISH_INCLUDE_DIR
+# LIBSQUISH_LIBRARIES
+
+FIND_PATH(LIBSQUISH_INCLUDE_DIR squish.h PATHS . squish .. ../squish DOC "Directory containing libSquish headers")
+FIND_LIBRARY(LIBSQUISH_LIBRARY NAMES squish libsquish PATHS . squish .. ../squish PATH_SUFFIXES lib lib64 release minsizerel relwithdebinfo DOC "Path to libSquish library")
+
+SET(LIBSQUISH_LIBRARIES ${LIBSQUISH_LIBRARY})
+
+IF (LIBSQUISH_LIBRARY AND LIBSQUISH_INCLUDE_DIR)
+  SET(LIBSQUISH_FOUND TRUE)
+  MESSAGE(STATUS "Found libSquish: ${LIBSQUISH_LIBRARY}")
+ENDIF (LIBSQUISH_LIBRARY AND LIBSQUISH_INCLUDE_DIR)
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..ba03f4c
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,52 @@
+1.10
+* Iterative cluster fit is now considered to be a new compression mode
+* The core cluster fit is now 4x faster using contributions by Ignacio
+Castano from NVIDIA
+* The single colour lookup table has been halved by exploiting symmetry
+
+1.9
+* Added contributed SSE1 truncate implementation
+* Changed use of SQUISH_USE_SSE to be 1 for SSE and 2 for SSE2 instructions
+* Cluster fit is now iterative to further reduce image error
+
+1.8
+* Switched from using floor to trunc for much better SSE performance (again)
+* Xcode build now expects libpng in /usr/local for extra/squishpng
+
+1.7
+* Fixed floating-point equality issue in clusterfit sort (x86 affected only)
+* Implemented proper SSE(2) floor function for 50% speedup on SSE builds
+* The range fit implementation now uses the correct colour metric
+
+1.6
+* Fixed bug in CompressImage where masked pixels were not skipped over
+* DXT3 and DXT5 alpha compression now properly use the mask to ignore pixels
+* Fixed major DXT1 bug that can generate unexpected transparent pixels
+
+1.5
+* Added CompressMasked function to handle incomplete DXT blocks more cleanly
+* Added kWeightColourByAlpha flag for better quality images when alpha blending
+
+1.4
+* Fixed stack overflow in rangefit
+
+1.3
+* Worked around SSE floor implementation bug, proper fix needed!
+* This release has visual studio and makefile builds that work
+
+1.2
+* Added provably optimal single colour compressor
+* Added extra/squishgen.cpp that generates single colour lookup tables
+
+1.1
+* Fixed a DXT1 colour output bug
+* Changed argument order for Decompress function to match Compress
+* Added GetStorageRequirements function
+* Added CompressImage function
+* Added DecompressImage function
+* Moved squishtool.cpp to extra/squishpng.cpp
+* Added extra/squishtest.cpp
+
+1.0
+* Initial release
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..ed1c78d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+ Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
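
The ChangeLog above introduces the public entry points GetStorageRequirements, CompressImage and DecompressImage, which are declared in squish.h elsewhere in this patch. As a rough usage sketch, assuming those declarations and the kDxt1 and kColourIterativeClusterFit flags, compressing and expanding a small image might look like:

#include <vector>
#include "squish.h"

int main()
{
    // a tiny 4x4 solid-red RGBA image
    int const width = 4, height = 4;
    std::vector< squish::u8 > rgba( width*height*4 );
    for( int i = 0; i < width*height; ++i )
    {
        rgba[4*i + 0] = 255;
        rgba[4*i + 1] = 0;
        rgba[4*i + 2] = 0;
        rgba[4*i + 3] = 255;
    }

    // ask the library how many bytes of DXT1 blocks are required
    int bytes = squish::GetStorageRequirements( width, height, squish::kDxt1 );
    std::vector< squish::u8 > blocks( bytes );

    // compress with the iterative cluster fit mentioned in ChangeLog 1.10,
    // then expand back into an RGBA buffer
    squish::CompressImage( &rgba[0], width, height, &blocks[0],
        squish::kDxt1 | squish::kColourIterativeClusterFit );

    std::vector< squish::u8 > decoded( width*height*4 );
    squish::DecompressImage( &decoded[0], width, height, &blocks[0], squish::kDxt1 );
}
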
diff --git a/README b/README new file mode 100644 index 0000000..d26b72e --- /dev/null +++ b/README @@ -0,0 +1,35 @@ +LICENSE +------- + +The squish library is distributed under the terms and conditions of the MIT +license. This license is specified at the top of each source file and must be +preserved in its entirety. + +BUILDING AND INSTALLING THE LIBRARY +----------------------------------- + +If you are using Visual Studio 2003 or above under Windows then load the Visual +Studio 2003 project in the vs7 folder. By default, the library is built using +SSE2 optimisations. To change this either change or remove the SQUISH_USE_SSE=2 +from the preprocessor symbols. + +If you are using a Mac then load the Xcode 2.2 project in the distribution. By +default, the library is built using Altivec optimisations. To change this +either change or remove SQUISH_USE_ALTIVEC=1 from the preprocessor symbols. I +guess I'll have to think about changing this for the new Intel Macs that are +rolling out... + +If you are using unix then first edit the config file in the base directory of +the distribution, enabling Altivec or SSE with the USE_ALTIVEC or USE_SSE +variables, and editing the optimisation flags passed to the C++ compiler if +necessary. Then make can be used to build the library, and make install (from +the superuser account) can be used to install (into /usr/local by default). + +REPORTING BUGS OR FEATURE REQUESTS +---------------------------------- + +Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk + +New releases are announced on the squish library homepage at +http://sjbrown.co.uk/?code=squish + diff --git a/alpha.cpp b/alpha.cpp new file mode 100644 index 0000000..0f94e21 --- /dev/null +++ b/alpha.cpp @@ -0,0 +1,350 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
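
alpha.cpp, which starts here, handles the DXT3 and DXT5 alpha channels. DXT3 alpha is stored by quantising each 8-bit value down to 4 bits, packing two values per byte, and expanding a nibble back with lo | ( lo << 4 ) on decompression, as the functions just below do. A minimal standalone sketch of that round trip, using hypothetical helper names that are not part of the patch:

#include <cstdio>

typedef unsigned char u8;

// quantise an 8-bit alpha down to 4 bits, rounding to nearest
static int QuantiseAlpha4( u8 alpha )
{
    return ( int )( ( float )alpha*( 15.0f/255.0f ) + 0.5f );
}

// expand a 4-bit value back to 8 bits by replicating the nibble
static u8 ExpandAlpha4( int quant )
{
    return ( u8 )( quant | ( quant << 4 ) );
}

int main()
{
    for( int alpha = 0; alpha < 256; alpha += 51 )
    {
        int q = QuantiseAlpha4( ( u8 )alpha );
        std::printf( "%3d -> %2d -> %3d\n", alpha, q, ExpandAlpha4( q ) );
    }
}
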
+ + -------------------------------------------------------------------------- */ + +#include "alpha.h" + +#include +#include + +namespace squish { + +static int FloatToInt( float a, int limit ) +{ + // use ANSI round-to-zero behaviour to get round-to-nearest + int i = ( int )( a + 0.5f ); + + // clamp to the limit + if( i < 0 ) + i = 0; + else if( i > limit ) + i = limit; + + // done + return i; +} + +void CompressAlphaDxt3( u8 const* rgba, int mask, void* block ) +{ + u8* bytes = reinterpret_cast< u8* >( block ); + + // quantise and pack the alpha values pairwise + for( int i = 0; i < 8; ++i ) + { + // quantise down to 4 bits + float alpha1 = ( float )rgba[8*i + 3] * ( 15.0f/255.0f ); + float alpha2 = ( float )rgba[8*i + 7] * ( 15.0f/255.0f ); + int quant1 = FloatToInt( alpha1, 15 ); + int quant2 = FloatToInt( alpha2, 15 ); + + // set alpha to zero where masked + int bit1 = 1 << ( 2*i ); + int bit2 = 1 << ( 2*i + 1 ); + if( ( mask & bit1 ) == 0 ) + quant1 = 0; + if( ( mask & bit2 ) == 0 ) + quant2 = 0; + + // pack into the byte + bytes[i] = ( u8 )( quant1 | ( quant2 << 4 ) ); + } +} + +void DecompressAlphaDxt3( u8* rgba, void const* block ) +{ + u8 const* bytes = reinterpret_cast< u8 const* >( block ); + + // unpack the alpha values pairwise + for( int i = 0; i < 8; ++i ) + { + // quantise down to 4 bits + u8 quant = bytes[i]; + + // unpack the values + u8 lo = quant & 0x0f; + u8 hi = quant & 0xf0; + + // convert back up to bytes + rgba[8*i + 3] = lo | ( lo << 4 ); + rgba[8*i + 7] = hi | ( hi >> 4 ); + } +} + +static void FixRange( int& min, int& max, int steps ) +{ + if( max - min < steps ) + max = std::min( min + steps, 255 ); + if( max - min < steps ) + min = std::max( 0, max - steps ); +} + +static int FitCodes( u8 const* rgba, int mask, u8 const* codes, u8* indices ) +{ + // fit each alpha value to the codebook + int err = 0; + for( int i = 0; i < 16; ++i ) + { + // check this pixel is valid + int bit = 1 << i; + if( ( mask & bit ) == 0 ) + { + // use the first code + indices[i] = 0; + continue; + } + + // find the least error and corresponding index + int value = rgba[4*i + 3]; + int least = INT_MAX; + int index = 0; + for( int j = 0; j < 8; ++j ) + { + // get the squared error from this code + int dist = ( int )value - ( int )codes[j]; + dist *= dist; + + // compare with the best so far + if( dist < least ) + { + least = dist; + index = j; + } + } + + // save this index and accumulate the error + indices[i] = ( u8 )index; + err += least; + } + + // return the total error + return err; +} + +static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block ) +{ + u8* bytes = reinterpret_cast< u8* >( block ); + + // write the first two bytes + bytes[0] = ( u8 )alpha0; + bytes[1] = ( u8 )alpha1; + + // pack the indices with 3 bits each + u8* dest = bytes + 2; + u8 const* src = indices; + for( int i = 0; i < 2; ++i ) + { + // pack 8 3-bit values + int value = 0; + for( int j = 0; j < 8; ++j ) + { + int index = *src++; + value |= ( index << 3*j ); + } + + // store in 3 bytes + for( int j = 0; j < 3; ++j ) + { + int byte = ( value >> 8*j ) & 0xff; + *dest++ = ( u8 )byte; + } + } +} + +static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block ) +{ + // check the relative values of the endpoints + if( alpha0 > alpha1 ) + { + // swap the indices + u8 swapped[16]; + for( int i = 0; i < 16; ++i ) + { + u8 index = indices[i]; + if( index == 0 ) + swapped[i] = 1; + else if( index == 1 ) + swapped[i] = 0; + else if( index <= 5 ) + swapped[i] = 7 - 
index; + else + swapped[i] = index; + } + + // write the block + WriteAlphaBlock( alpha1, alpha0, swapped, block ); + } + else + { + // write the block + WriteAlphaBlock( alpha0, alpha1, indices, block ); + } +} + +static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block ) +{ + // check the relative values of the endpoints + if( alpha0 < alpha1 ) + { + // swap the indices + u8 swapped[16]; + for( int i = 0; i < 16; ++i ) + { + u8 index = indices[i]; + if( index == 0 ) + swapped[i] = 1; + else if( index == 1 ) + swapped[i] = 0; + else + swapped[i] = 9 - index; + } + + // write the block + WriteAlphaBlock( alpha1, alpha0, swapped, block ); + } + else + { + // write the block + WriteAlphaBlock( alpha0, alpha1, indices, block ); + } +} + +void CompressAlphaDxt5( u8 const* rgba, int mask, void* block ) +{ + // get the range for 5-alpha and 7-alpha interpolation + int min5 = 255; + int max5 = 0; + int min7 = 255; + int max7 = 0; + for( int i = 0; i < 16; ++i ) + { + // check this pixel is valid + int bit = 1 << i; + if( ( mask & bit ) == 0 ) + continue; + + // incorporate into the min/max + int value = rgba[4*i + 3]; + if( value < min7 ) + min7 = value; + if( value > max7 ) + max7 = value; + if( value != 0 && value < min5 ) + min5 = value; + if( value != 255 && value > max5 ) + max5 = value; + } + + // handle the case that no valid range was found + if( min5 > max5 ) + min5 = max5; + if( min7 > max7 ) + min7 = max7; + + // fix the range to be the minimum in each case + FixRange( min5, max5, 5 ); + FixRange( min7, max7, 7 ); + + // set up the 5-alpha code book + u8 codes5[8]; + codes5[0] = ( u8 )min5; + codes5[1] = ( u8 )max5; + for( int i = 1; i < 5; ++i ) + codes5[1 + i] = ( u8 )( ( ( 5 - i )*min5 + i*max5 )/5 ); + codes5[6] = 0; + codes5[7] = 255; + + // set up the 7-alpha code book + u8 codes7[8]; + codes7[0] = ( u8 )min7; + codes7[1] = ( u8 )max7; + for( int i = 1; i < 7; ++i ) + codes7[1 + i] = ( u8 )( ( ( 7 - i )*min7 + i*max7 )/7 ); + + // fit the data to both code books + u8 indices5[16]; + u8 indices7[16]; + int err5 = FitCodes( rgba, mask, codes5, indices5 ); + int err7 = FitCodes( rgba, mask, codes7, indices7 ); + + // save the block with least error + if( err5 <= err7 ) + WriteAlphaBlock5( min5, max5, indices5, block ); + else + WriteAlphaBlock7( min7, max7, indices7, block ); +} + +void DecompressAlphaDxt5( u8* rgba, void const* block ) +{ + // get the two alpha values + u8 const* bytes = reinterpret_cast< u8 const* >( block ); + int alpha0 = bytes[0]; + int alpha1 = bytes[1]; + + // compare the values to build the codebook + u8 codes[8]; + codes[0] = ( u8 )alpha0; + codes[1] = ( u8 )alpha1; + if( alpha0 <= alpha1 ) + { + // use 5-alpha codebook + for( int i = 1; i < 5; ++i ) + codes[1 + i] = ( u8 )( ( ( 5 - i )*alpha0 + i*alpha1 )/5 ); + codes[6] = 0; + codes[7] = 255; + } + else + { + // use 7-alpha codebook + for( int i = 1; i < 7; ++i ) + codes[1 + i] = ( u8 )( ( ( 7 - i )*alpha0 + i*alpha1 )/7 ); + } + + // decode the indices + u8 indices[16]; + u8 const* src = bytes + 2; + u8* dest = indices; + for( int i = 0; i < 2; ++i ) + { + // grab 3 bytes + int value = 0; + for( int j = 0; j < 3; ++j ) + { + int byte = *src++; + value |= ( byte << 8*j ); + } + + // unpack 8 3-bit values from it + for( int j = 0; j < 8; ++j ) + { + int index = ( value >> 3*j ) & 0x7; + *dest++ = ( u8 )index; + } + } + + // write out the indexed codebook values + for( int i = 0; i < 16; ++i ) + rgba[4*i + 3] = codes[indices[i]]; +} + +} // namespace squish diff --git a/alpha.h 
b/alpha.h new file mode 100644 index 0000000..e5e7f32 --- /dev/null +++ b/alpha.h @@ -0,0 +1,41 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_ALPHA_H +#define SQUISH_ALPHA_H + +#include "squish.h" + +namespace squish { + +void CompressAlphaDxt3( u8 const* rgba, int mask, void* block ); +void CompressAlphaDxt5( u8 const* rgba, int mask, void* block ); + +void DecompressAlphaDxt3( u8* rgba, void const* block ); +void DecompressAlphaDxt5( u8* rgba, void const* block ); + +} // namespace squish + +#endif // ndef SQUISH_ALPHA_H diff --git a/clusterfit.cpp b/clusterfit.cpp new file mode 100644 index 0000000..9670446 --- /dev/null +++ b/clusterfit.cpp @@ -0,0 +1,392 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2007 Ignacio Castano icastano@nvidia.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
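
For DXT5, CompressAlphaDxt5 and DecompressAlphaDxt5 above derive an eight-entry alpha codebook from the two stored endpoints: six interpolated values when alpha0 > alpha1, or four interpolated values plus explicit 0 and 255 otherwise. A small standalone sketch (BuildAlphaCodes is a hypothetical name, not part of the patch) that reproduces the decoder's table construction:

#include <cstdio>

typedef unsigned char u8;

// build the DXT5 alpha codebook the same way DecompressAlphaDxt5 does
static void BuildAlphaCodes( int alpha0, int alpha1, u8 codes[8] )
{
    codes[0] = ( u8 )alpha0;
    codes[1] = ( u8 )alpha1;
    if( alpha0 <= alpha1 )
    {
        // 5-alpha codebook: four interpolants plus explicit 0 and 255
        for( int i = 1; i < 5; ++i )
            codes[1 + i] = ( u8 )( ( ( 5 - i )*alpha0 + i*alpha1 )/5 );
        codes[6] = 0;
        codes[7] = 255;
    }
    else
    {
        // 7-alpha codebook: six interpolants between the endpoints
        for( int i = 1; i < 7; ++i )
            codes[1 + i] = ( u8 )( ( ( 7 - i )*alpha0 + i*alpha1 )/7 );
    }
}

int main()
{
    u8 codes[8];
    BuildAlphaCodes( 32, 224, codes );
    for( int i = 0; i < 8; ++i )
        std::printf( "code[%d] = %d\n", i, codes[i] );
}
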
+ + -------------------------------------------------------------------------- */ + +#include "clusterfit.h" +#include "colourset.h" +#include "colourblock.h" +#include + +namespace squish { + +ClusterFit::ClusterFit( ColourSet const* colours, int flags, float* metric ) + : ColourFit( colours, flags ) +{ + // set the iteration count + m_iterationCount = ( m_flags & kColourIterativeClusterFit ) ? kMaxIterations : 1; + + // initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f) + if( metric ) + m_metric = Vec4( metric[0], metric[1], metric[2], 1.0f ); + else + m_metric = VEC4_CONST( 1.0f ); + + // initialise the best error + m_besterror = VEC4_CONST( FLT_MAX ); + + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // get the covariance matrix + Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() ); + + // compute the principle component + m_principle = ComputePrincipleComponent( covariance ); +} + +bool ClusterFit::ConstructOrdering( Vec3 const& axis, int iteration ) +{ + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // build the list of dot products + float dps[16]; + u8* order = ( u8* )m_order + 16*iteration; + for( int i = 0; i < count; ++i ) + { + dps[i] = Dot( values[i], axis ); + order[i] = ( u8 )i; + } + + // stable sort using them + for( int i = 0; i < count; ++i ) + { + for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j ) + { + std::swap( dps[j], dps[j - 1] ); + std::swap( order[j], order[j - 1] ); + } + } + + // check this ordering is unique + for( int it = 0; it < iteration; ++it ) + { + u8 const* prev = ( u8* )m_order + 16*it; + bool same = true; + for( int i = 0; i < count; ++i ) + { + if( order[i] != prev[i] ) + { + same = false; + break; + } + } + if( same ) + return false; + } + + // copy the ordering and weight all the points + Vec3 const* unweighted = m_colours->GetPoints(); + float const* weights = m_colours->GetWeights(); + m_xsum_wsum = VEC4_CONST( 0.0f ); + for( int i = 0; i < count; ++i ) + { + int j = order[i]; + Vec4 p( unweighted[j].X(), unweighted[j].Y(), unweighted[j].Z(), 1.0f ); + Vec4 w( weights[j] ); + Vec4 x = p*w; + m_points_weights[i] = x; + m_xsum_wsum += x; + } + return true; +} + +void ClusterFit::Compress3( void* block ) +{ + // declare variables + int const count = m_colours->GetCount(); + Vec4 const two = VEC4_CONST( 2.0 ); + Vec4 const one = VEC4_CONST( 1.0f ); + Vec4 const half_half2( 0.5f, 0.5f, 0.5f, 0.25f ); + Vec4 const zero = VEC4_CONST( 0.0f ); + Vec4 const half = VEC4_CONST( 0.5f ); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // prepare an ordering using the principle axis + ConstructOrdering( m_principle, 0 ); + + // check all possible clusters and iterate on the total order + Vec4 beststart = VEC4_CONST( 0.0f ); + Vec4 bestend = VEC4_CONST( 0.0f ); + Vec4 besterror = m_besterror; + u8 bestindices[16]; + int bestiteration = 0; + int besti = 0, bestj = 0; + + // loop over iterations (we avoid the case that all points in first or last cluster) + for( int iterationIndex = 0;; ) + { + // first cluster [0,i) is at the start + Vec4 part0 = VEC4_CONST( 0.0f ); + for( int i = 0; i < count; ++i ) + { + // second cluster [i,j) is half along + Vec4 part1 = ( i == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f ); + int jmin = ( i == 0 ) ? 
1 : i; + for( int j = jmin;; ) + { + // last cluster [j,count) is at the end + Vec4 part2 = m_xsum_wsum - part1 - part0; + + // compute least squares terms directly + Vec4 alphax_sum = MultiplyAdd( part1, half_half2, part0 ); + Vec4 alpha2_sum = alphax_sum.SplatW(); + + Vec4 betax_sum = MultiplyAdd( part1, half_half2, part2 ); + Vec4 beta2_sum = betax_sum.SplatW(); + + Vec4 alphabeta_sum = ( part1*half_half2 ).SplatW(); + + // compute the least-squares optimal points + Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) ); + Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor; + Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor; + + // clamp to the grid + a = Min( one, Max( zero, a ) ); + b = Min( one, Max( zero, b ) ); + a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp; + b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp; + + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + + // apply the metric to the error term + Vec4 e5 = e4*m_metric; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); + + // keep the solution if it wins + if( CompareAnyLessThan( error, besterror ) ) + { + beststart = a; + bestend = b; + besti = i; + bestj = j; + besterror = error; + bestiteration = iterationIndex; + } + + // advance + if( j == count ) + break; + part1 += m_points_weights[j]; + ++j; + } + + // advance + part0 += m_points_weights[i]; + } + + // stop if we didn't improve in this iteration + if( bestiteration != iterationIndex ) + break; + + // advance if possible + ++iterationIndex; + if( iterationIndex == m_iterationCount ) + break; + + // stop if a new iteration is an ordering that has already been tried + Vec3 axis = ( bestend - beststart ).GetVec3(); + if( !ConstructOrdering( axis, iterationIndex ) ) + break; + } + + // save the block if necessary + if( CompareAnyLessThan( besterror, m_besterror ) ) + { + // remap the indices + u8 const* order = ( u8* )m_order + 16*bestiteration; + + u8 unordered[16]; + for( int m = 0; m < besti; ++m ) + unordered[order[m]] = 0; + for( int m = besti; m < bestj; ++m ) + unordered[order[m]] = 2; + for( int m = bestj; m < count; ++m ) + unordered[order[m]] = 1; + + m_colours->RemapIndices( unordered, bestindices ); + + // save the block + WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); + + // save the error + m_besterror = besterror; + } +} + +void ClusterFit::Compress4( void* block ) +{ + // declare variables + int const count = m_colours->GetCount(); + Vec4 const two = VEC4_CONST( 2.0f ); + Vec4 const one = VEC4_CONST( 1.0f ); + Vec4 const onethird_onethird2( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); + Vec4 const twothirds_twothirds2( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); + Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f ); + Vec4 const zero = VEC4_CONST( 0.0f ); + Vec4 const half = VEC4_CONST( 0.5f ); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // prepare an ordering using the principle axis + ConstructOrdering( m_principle, 0 ); + + // check all possible clusters and iterate on the total order + Vec4 beststart = VEC4_CONST( 0.0f ); + Vec4 bestend = VEC4_CONST( 0.0f ); + Vec4 
besterror = m_besterror; + u8 bestindices[16]; + int bestiteration = 0; + int besti = 0, bestj = 0, bestk = 0; + + // loop over iterations (we avoid the case that all points in first or last cluster) + for( int iterationIndex = 0;; ) + { + // first cluster [0,i) is at the start + Vec4 part0 = VEC4_CONST( 0.0f ); + for( int i = 0; i < count; ++i ) + { + // second cluster [i,j) is one third along + Vec4 part1 = VEC4_CONST( 0.0f ); + for( int j = i;; ) + { + // third cluster [j,k) is two thirds along + Vec4 part2 = ( j == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f ); + int kmin = ( j == 0 ) ? 1 : j; + for( int k = kmin;; ) + { + // last cluster [k,count) is at the end + Vec4 part3 = m_xsum_wsum - part2 - part1 - part0; + + // compute least squares terms directly + Vec4 const alphax_sum = MultiplyAdd( part2, onethird_onethird2, MultiplyAdd( part1, twothirds_twothirds2, part0 ) ); + Vec4 const alpha2_sum = alphax_sum.SplatW(); + + Vec4 const betax_sum = MultiplyAdd( part1, onethird_onethird2, MultiplyAdd( part2, twothirds_twothirds2, part3 ) ); + Vec4 const beta2_sum = betax_sum.SplatW(); + + Vec4 const alphabeta_sum = twonineths*( part1 + part2 ).SplatW(); + + // compute the least-squares optimal points + Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) ); + Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor; + Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor; + + // clamp to the grid + a = Min( one, Max( zero, a ) ); + b = Min( one, Max( zero, b ) ); + a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp; + b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp; + + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + + // apply the metric to the error term + Vec4 e5 = e4*m_metric; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); + + // keep the solution if it wins + if( CompareAnyLessThan( error, besterror ) ) + { + beststart = a; + bestend = b; + besterror = error; + besti = i; + bestj = j; + bestk = k; + bestiteration = iterationIndex; + } + + // advance + if( k == count ) + break; + part2 += m_points_weights[k]; + ++k; + } + + // advance + if( j == count ) + break; + part1 += m_points_weights[j]; + ++j; + } + + // advance + part0 += m_points_weights[i]; + } + + // stop if we didn't improve in this iteration + if( bestiteration != iterationIndex ) + break; + + // advance if possible + ++iterationIndex; + if( iterationIndex == m_iterationCount ) + break; + + // stop if a new iteration is an ordering that has already been tried + Vec3 axis = ( bestend - beststart ).GetVec3(); + if( !ConstructOrdering( axis, iterationIndex ) ) + break; + } + + // save the block if necessary + if( CompareAnyLessThan( besterror, m_besterror ) ) + { + // remap the indices + u8 const* order = ( u8* )m_order + 16*bestiteration; + + u8 unordered[16]; + for( int m = 0; m < besti; ++m ) + unordered[order[m]] = 0; + for( int m = besti; m < bestj; ++m ) + unordered[order[m]] = 2; + for( int m = bestj; m < bestk; ++m ) + unordered[order[m]] = 3; + for( int m = bestk; m < count; ++m ) + unordered[order[m]] = 1; + + m_colours->RemapIndices( unordered, bestindices ); + + // save the block + WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), 
bestindices, block ); + + // save the error + m_besterror = besterror; + } +} + +} // namespace squish diff --git a/clusterfit.h b/clusterfit.h new file mode 100644 index 0000000..c882469 --- /dev/null +++ b/clusterfit.h @@ -0,0 +1,61 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2007 Ignacio Castano icastano@nvidia.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_CLUSTERFIT_H +#define SQUISH_CLUSTERFIT_H + +#include "squish.h" +#include "maths.h" +#include "simd.h" +#include "colourfit.h" + +namespace squish { + +class ClusterFit : public ColourFit +{ +public: + ClusterFit( ColourSet const* colours, int flags, float* metric ); + +private: + bool ConstructOrdering( Vec3 const& axis, int iteration ); + + virtual void Compress3( void* block ); + virtual void Compress4( void* block ); + + enum { kMaxIterations = 8 }; + + int m_iterationCount; + Vec3 m_principle; + u8 m_order[16*kMaxIterations]; + Vec4 m_points_weights[16]; + Vec4 m_xsum_wsum; + Vec4 m_metric; + Vec4 m_besterror; +}; + +} // namespace squish + +#endif // ndef SQUISH_CLUSTERFIT_H diff --git a/colourblock.cpp b/colourblock.cpp new file mode 100644 index 0000000..e6a5788 --- /dev/null +++ b/colourblock.cpp @@ -0,0 +1,214 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
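
ClusterFit::Compress3 and Compress4 above solve, for every candidate partition of the ordered points, a small weighted least-squares problem for the two endpoints: minimising sum_i w_i*(alpha_i*a + beta_i*b - x_i)^2 gives 2x2 normal equations whose terms are exactly the alpha2_sum, beta2_sum, alphabeta_sum, alphax_sum and betax_sum accumulators. A scalar sketch of the same solve for a single channel (SolveEndpoints is a hypothetical name, not part of the patch):

#include <cstdio>

// solve min sum w_i*(alpha_i*a + beta_i*b - x_i)^2 for the endpoints a and b;
// this mirrors the alphax_sum/betax_sum/factor arithmetic in Compress3/4,
// but on plain floats for one colour channel
static void SolveEndpoints( int n, float const* x, float const* w,
                            float const* alpha, float const* beta,
                            float& a, float& b )
{
    float alpha2 = 0.0f, beta2 = 0.0f, alphabeta = 0.0f;
    float alphax = 0.0f, betax = 0.0f;
    for( int i = 0; i < n; ++i )
    {
        alpha2    += w[i]*alpha[i]*alpha[i];
        beta2     += w[i]*beta[i]*beta[i];
        alphabeta += w[i]*alpha[i]*beta[i];
        alphax    += w[i]*alpha[i]*x[i];
        betax     += w[i]*beta[i]*x[i];
    }

    // invert the 2x2 normal matrix (the "factor" term in the SIMD code)
    float factor = 1.0f/( alpha2*beta2 - alphabeta*alphabeta );
    a = ( alphax*beta2 - betax*alphabeta )*factor;
    b = ( betax*alpha2 - alphax*alphabeta )*factor;
}

int main()
{
    float x[3]     = { 0.1f, 0.5f, 0.9f };
    float w[3]     = { 1.0f, 1.0f, 1.0f };
    float alpha[3] = { 1.0f, 0.5f, 0.0f };  // cluster weights as in Compress3
    float beta[3]  = { 0.0f, 0.5f, 1.0f };
    float a, b;
    SolveEndpoints( 3, x, w, alpha, beta, a, b );
    std::printf( "a = %f, b = %f\n", a, b );  // recovers 0.1 and 0.9
}
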
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "colourblock.h" + +namespace squish { + +static int FloatToInt( float a, int limit ) +{ + // use ANSI round-to-zero behaviour to get round-to-nearest + int i = ( int )( a + 0.5f ); + + // clamp to the limit + if( i < 0 ) + i = 0; + else if( i > limit ) + i = limit; + + // done + return i; +} + +static int FloatTo565( Vec3::Arg colour ) +{ + // get the components in the correct range + int r = FloatToInt( 31.0f*colour.X(), 31 ); + int g = FloatToInt( 63.0f*colour.Y(), 63 ); + int b = FloatToInt( 31.0f*colour.Z(), 31 ); + + // pack into a single value + return ( r << 11 ) | ( g << 5 ) | b; +} + +static void WriteColourBlock( int a, int b, u8* indices, void* block ) +{ + // get the block as bytes + u8* bytes = ( u8* )block; + + // write the endpoints + bytes[0] = ( u8 )( a & 0xff ); + bytes[1] = ( u8 )( a >> 8 ); + bytes[2] = ( u8 )( b & 0xff ); + bytes[3] = ( u8 )( b >> 8 ); + + // write the indices + for( int i = 0; i < 4; ++i ) + { + u8 const* ind = indices + 4*i; + bytes[4 + i] = ind[0] | ( ind[1] << 2 ) | ( ind[2] << 4 ) | ( ind[3] << 6 ); + } +} + +void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ) +{ + // get the packed values + int a = FloatTo565( start ); + int b = FloatTo565( end ); + + // remap the indices + u8 remapped[16]; + if( a <= b ) + { + // use the indices directly + for( int i = 0; i < 16; ++i ) + remapped[i] = indices[i]; + } + else + { + // swap a and b + std::swap( a, b ); + for( int i = 0; i < 16; ++i ) + { + if( indices[i] == 0 ) + remapped[i] = 1; + else if( indices[i] == 1 ) + remapped[i] = 0; + else + remapped[i] = indices[i]; + } + } + + // write the block + WriteColourBlock( a, b, remapped, block ); +} + +void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ) +{ + // get the packed values + int a = FloatTo565( start ); + int b = FloatTo565( end ); + + // remap the indices + u8 remapped[16]; + if( a < b ) + { + // swap a and b + std::swap( a, b ); + for( int i = 0; i < 16; ++i ) + remapped[i] = ( indices[i] ^ 0x1 ) & 0x3; + } + else if( a == b ) + { + // use index 0 + for( int i = 0; i < 16; ++i ) + remapped[i] = 0; + } + else + { + // use the indices directly + for( int i = 0; i < 16; ++i ) + remapped[i] = indices[i]; + } + + // write the block + WriteColourBlock( a, b, remapped, block ); +} + +static int Unpack565( u8 const* packed, u8* colour ) +{ + // build the packed value + int value = ( int )packed[0] | ( ( int )packed[1] << 8 ); + + // get the components in the stored range + u8 red = ( u8 )( ( value >> 11 ) & 0x1f ); + u8 green = ( u8 )( ( value >> 5 ) & 0x3f ); + u8 blue = ( u8 )( value & 0x1f ); + + // scale up to 8 bits + colour[0] = ( red << 3 ) | ( red >> 2 ); + colour[1] = ( green << 2 ) | ( green >> 4 ); + colour[2] = ( blue << 3 ) | ( blue >> 2 ); + colour[3] = 255; + + // return the value + return value; +} + +void DecompressColour( u8* rgba, void const* block, bool isDxt1 ) +{ + // get the block bytes + u8 const* bytes = reinterpret_cast< u8 const* >( block ); + + // unpack the endpoints + u8 codes[16]; + int a = Unpack565( bytes, codes ); + int b = Unpack565( bytes + 2, codes + 4 ); + + // generate the midpoints + for( 
int i = 0; i < 3; ++i ) + { + int c = codes[i]; + int d = codes[4 + i]; + + if( isDxt1 && a <= b ) + { + codes[8 + i] = ( u8 )( ( c + d )/2 ); + codes[12 + i] = 0; + } + else + { + codes[8 + i] = ( u8 )( ( 2*c + d )/3 ); + codes[12 + i] = ( u8 )( ( c + 2*d )/3 ); + } + } + + // fill in alpha for the intermediate values + codes[8 + 3] = 255; + codes[12 + 3] = ( isDxt1 && a <= b ) ? 0 : 255; + + // unpack the indices + u8 indices[16]; + for( int i = 0; i < 4; ++i ) + { + u8* ind = indices + 4*i; + u8 packed = bytes[4 + i]; + + ind[0] = packed & 0x3; + ind[1] = ( packed >> 2 ) & 0x3; + ind[2] = ( packed >> 4 ) & 0x3; + ind[3] = ( packed >> 6 ) & 0x3; + } + + // store out the colours + for( int i = 0; i < 16; ++i ) + { + u8 offset = 4*indices[i]; + for( int j = 0; j < 4; ++j ) + rgba[4*i + j] = codes[offset + j]; + } +} + +} // namespace squish diff --git a/colourblock.h b/colourblock.h new file mode 100644 index 0000000..2562561 --- /dev/null +++ b/colourblock.h @@ -0,0 +1,41 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
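
colourblock.cpp above packs each endpoint into a 5:6:5 word, and Unpack565 expands it back by replicating the high bits into the low bits. A short round trip on plain integers (hypothetical helper names; integer rounding stands in for the float FloatToInt path used by FloatTo565):

#include <cstdio>

typedef unsigned char u8;

// pack 8-bit RGB into a 5:6:5 word, rounding to nearest
static int PackColour565( u8 r, u8 g, u8 b )
{
    int r5 = ( r*31 + 127 )/255;
    int g6 = ( g*63 + 127 )/255;
    int b5 = ( b*31 + 127 )/255;
    return ( r5 << 11 ) | ( g6 << 5 ) | b5;
}

// expand a 5:6:5 word back to 8-bit RGB by replicating the top bits
static void UnpackColour565( int value, u8 rgb[3] )
{
    u8 red   = ( u8 )( ( value >> 11 ) & 0x1f );
    u8 green = ( u8 )( ( value >> 5 ) & 0x3f );
    u8 blue  = ( u8 )( value & 0x1f );
    rgb[0] = ( red << 3 ) | ( red >> 2 );
    rgb[1] = ( green << 2 ) | ( green >> 4 );
    rgb[2] = ( blue << 3 ) | ( blue >> 2 );
}

int main()
{
    u8 rgb[3];
    int packed = PackColour565( 200, 100, 50 );
    UnpackColour565( packed, rgb );
    std::printf( "0x%04x -> %d %d %d\n", packed, rgb[0], rgb[1], rgb[2] );
}
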
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_COLOURBLOCK_H +#define SQUISH_COLOURBLOCK_H + +#include "squish.h" +#include "maths.h" + +namespace squish { + +void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ); +void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ); + +void DecompressColour( u8* rgba, void const* block, bool isDxt1 ); + +} // namespace squish + +#endif // ndef SQUISH_COLOURBLOCK_H diff --git a/colourfit.cpp b/colourfit.cpp new file mode 100644 index 0000000..11efa46 --- /dev/null +++ b/colourfit.cpp @@ -0,0 +1,54 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "colourfit.h" +#include "colourset.h" + +namespace squish { + +ColourFit::ColourFit( ColourSet const* colours, int flags ) + : m_colours( colours ), + m_flags( flags ) +{ +} + +ColourFit::~ColourFit() +{ +} + +void ColourFit::Compress( void* block ) +{ + bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 ); + if( isDxt1 ) + { + Compress3( block ); + if( !m_colours->IsTransparent() ) + Compress4( block ); + } + else + Compress4( block ); +} + +} // namespace squish diff --git a/colourfit.h b/colourfit.h new file mode 100644 index 0000000..7593223 --- /dev/null +++ b/colourfit.h @@ -0,0 +1,56 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
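
ColourFit::Compress above tries the three-colour mode first for DXT1 and only attempts the four-colour mode when the block has no transparent pixels. The mode is never stored explicitly: a decoder such as DecompressColour selects three-colour mode exactly when colour0 <= colour1, which is why WriteColourBlock4 swaps its endpoints and flips the low index bit when they arrive in the wrong order. A tiny sketch of that index fix-up (FlipFourColourIndex is a hypothetical name, not part of the patch):

#include <cstdio>

// when the two 5:6:5 endpoints of a four-colour block are swapped, each
// palette index keeps pointing at the same colour if its low bit is flipped:
// 0 <-> 1 (the endpoints) and 2 <-> 3 (the two interpolants)
static unsigned char FlipFourColourIndex( unsigned char index )
{
    return ( unsigned char )( ( index ^ 0x1 ) & 0x3 );
}

int main()
{
    for( int index = 0; index < 4; ++index )
        std::printf( "%d -> %d\n", index, FlipFourColourIndex( ( unsigned char )index ) );
}
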
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_COLOURFIT_H +#define SQUISH_COLOURFIT_H + +#include "squish.h" +#include "maths.h" + +#include + +namespace squish { + +class ColourSet; + +class ColourFit +{ +public: + ColourFit( ColourSet const* colours, int flags ); + virtual ~ColourFit(); + + void Compress( void* block ); + +protected: + virtual void Compress3( void* block ) = 0; + virtual void Compress4( void* block ) = 0; + + ColourSet const* m_colours; + int m_flags; +}; + +} // namespace squish + +#endif // ndef SQUISH_COLOURFIT_H diff --git a/colourset.cpp b/colourset.cpp new file mode 100644 index 0000000..97d29d9 --- /dev/null +++ b/colourset.cpp @@ -0,0 +1,121 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "colourset.h" + +namespace squish { + +ColourSet::ColourSet( u8 const* rgba, int mask, int flags ) + : m_count( 0 ), + m_transparent( false ) +{ + // check the compression mode for dxt1 + bool isDxt1 = ( ( flags & kDxt1 ) != 0 ); + bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 ); + + // create the minimal set + for( int i = 0; i < 16; ++i ) + { + // check this pixel is enabled + int bit = 1 << i; + if( ( mask & bit ) == 0 ) + { + m_remap[i] = -1; + continue; + } + + // check for transparent pixels when using dxt1 + if( isDxt1 && rgba[4*i + 3] < 128 ) + { + m_remap[i] = -1; + m_transparent = true; + continue; + } + + // loop over previous points for a match + for( int j = 0;; ++j ) + { + // allocate a new point + if( j == i ) + { + // normalise coordinates to [0,1] + float x = ( float )rgba[4*i] / 255.0f; + float y = ( float )rgba[4*i + 1] / 255.0f; + float z = ( float )rgba[4*i + 2] / 255.0f; + + // ensure there is always non-zero weight even for zero alpha + float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f; + + // add the point + m_points[m_count] = Vec3( x, y, z ); + m_weights[m_count] = ( weightByAlpha ? 
w : 1.0f ); + m_remap[i] = m_count; + + // advance + ++m_count; + break; + } + + // check for a match + int oldbit = 1 << j; + bool match = ( ( mask & oldbit ) != 0 ) + && ( rgba[4*i] == rgba[4*j] ) + && ( rgba[4*i + 1] == rgba[4*j + 1] ) + && ( rgba[4*i + 2] == rgba[4*j + 2] ) + && ( rgba[4*j + 3] >= 128 || !isDxt1 ); + if( match ) + { + // get the index of the match + int index = m_remap[j]; + + // ensure there is always non-zero weight even for zero alpha + float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f; + + // map to this point and increase the weight + m_weights[index] += ( weightByAlpha ? w : 1.0f ); + m_remap[i] = index; + break; + } + } + } + + // square root the weights + for( int i = 0; i < m_count; ++i ) + m_weights[i] = std::sqrt( m_weights[i] ); +} + +void ColourSet::RemapIndices( u8 const* source, u8* target ) const +{ + for( int i = 0; i < 16; ++i ) + { + int j = m_remap[i]; + if( j == -1 ) + target[i] = 3; + else + target[i] = source[j]; + } +} + +} // namespace squish diff --git a/colourset.h b/colourset.h new file mode 100644 index 0000000..0c66fe4 --- /dev/null +++ b/colourset.h @@ -0,0 +1,58 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_COLOURSET_H +#define SQUISH_COLOURSET_H + +#include "squish.h" +#include "maths.h" + +namespace squish { + +/*! 
@brief Represents a set of block colours +*/ +class ColourSet +{ +public: + ColourSet( u8 const* rgba, int mask, int flags ); + + int GetCount() const { return m_count; } + Vec3 const* GetPoints() const { return m_points; } + float const* GetWeights() const { return m_weights; } + bool IsTransparent() const { return m_transparent; } + + void RemapIndices( u8 const* source, u8* target ) const; + +private: + int m_count; + Vec3 m_points[16]; + float m_weights[16]; + int m_remap[16]; + bool m_transparent; +}; + +} // namespace sqish + +#endif // ndef SQUISH_COLOURSET_H diff --git a/config.h b/config.h new file mode 100644 index 0000000..2fad557 --- /dev/null +++ b/config.h @@ -0,0 +1,49 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_CONFIG_H +#define SQUISH_CONFIG_H + +// Set to 1 when building squish to use Altivec instructions. +#ifndef SQUISH_USE_ALTIVEC +#define SQUISH_USE_ALTIVEC 0 +#endif + +// Set to 1 or 2 when building squish to use SSE or SSE2 instructions. +#ifndef SQUISH_USE_SSE +#define SQUISH_USE_SSE 0 +#endif + +// Internally set SQUISH_USE_SIMD when either Altivec or SSE is available. +#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE +#error "Cannot enable both Altivec and SSE!" +#endif +#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE +#define SQUISH_USE_SIMD 1 +#else +#define SQUISH_USE_SIMD 0 +#endif + +#endif // ndef SQUISH_CONFIG_H diff --git a/maths.cpp b/maths.cpp new file mode 100644 index 0000000..9af4197 --- /dev/null +++ b/maths.cpp @@ -0,0 +1,259 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
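
colourset.h above describes the deduplicated point set that every fitter consumes. As a rough usage sketch, assuming the constructor and accessors declared above and a build that can see this internal header (only squish.h is installed by the CMake rules), collecting one 4x4 RGBA block into a ColourSet might look like:

#include <cstdio>
#include "colourset.h"
#include "squish.h"

int main()
{
    // a 4x4 block: opaque red everywhere except one transparent pixel
    squish::u8 rgba[16*4];
    for( int i = 0; i < 16; ++i )
    {
        rgba[4*i + 0] = 255;
        rgba[4*i + 1] = 0;
        rgba[4*i + 2] = 0;
        rgba[4*i + 3] = ( i == 5 ) ? 0 : 255;
    }

    // all 16 pixels enabled; dxt1 treats alpha below 128 as transparent
    squish::ColourSet colours( rgba, 0xffff, squish::kDxt1 );

    std::printf( "unique opaque colours: %d\n", colours.GetCount() );
    std::printf( "block has transparency: %d\n", colours.IsTransparent() );
}
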
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +/*! @file + + The symmetric eigensystem solver algorithm is from + http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf +*/ + +#include "maths.h" +#include "simd.h" +#include + +namespace squish { + +Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights ) +{ + // compute the centroid + float total = 0.0f; + Vec3 centroid( 0.0f ); + for( int i = 0; i < n; ++i ) + { + total += weights[i]; + centroid += weights[i]*points[i]; + } + if( total > FLT_EPSILON ) + centroid /= total; + + // accumulate the covariance matrix + Sym3x3 covariance( 0.0f ); + for( int i = 0; i < n; ++i ) + { + Vec3 a = points[i] - centroid; + Vec3 b = weights[i]*a; + + covariance[0] += a.X()*b.X(); + covariance[1] += a.X()*b.Y(); + covariance[2] += a.X()*b.Z(); + covariance[3] += a.Y()*b.Y(); + covariance[4] += a.Y()*b.Z(); + covariance[5] += a.Z()*b.Z(); + } + + // return it + return covariance; +} + +#if 0 + +static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue ) +{ + // compute M + Sym3x3 m; + m[0] = matrix[0] - evalue; + m[1] = matrix[1]; + m[2] = matrix[2]; + m[3] = matrix[3] - evalue; + m[4] = matrix[4]; + m[5] = matrix[5] - evalue; + + // compute U + Sym3x3 u; + u[0] = m[3]*m[5] - m[4]*m[4]; + u[1] = m[2]*m[4] - m[1]*m[5]; + u[2] = m[1]*m[4] - m[2]*m[3]; + u[3] = m[0]*m[5] - m[2]*m[2]; + u[4] = m[1]*m[2] - m[4]*m[0]; + u[5] = m[0]*m[3] - m[1]*m[1]; + + // find the largest component + float mc = std::fabs( u[0] ); + int mi = 0; + for( int i = 1; i < 6; ++i ) + { + float c = std::fabs( u[i] ); + if( c > mc ) + { + mc = c; + mi = i; + } + } + + // pick the column with this component + switch( mi ) + { + case 0: + return Vec3( u[0], u[1], u[2] ); + + case 1: + case 3: + return Vec3( u[1], u[3], u[4] ); + + default: + return Vec3( u[2], u[4], u[5] ); + } +} + +static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue ) +{ + // compute M + Sym3x3 m; + m[0] = matrix[0] - evalue; + m[1] = matrix[1]; + m[2] = matrix[2]; + m[3] = matrix[3] - evalue; + m[4] = matrix[4]; + m[5] = matrix[5] - evalue; + + // find the largest component + float mc = std::fabs( m[0] ); + int mi = 0; + for( int i = 1; i < 6; ++i ) + { + float c = std::fabs( m[i] ); + if( c > mc ) + { + mc = c; + mi = i; + } + } + + // pick the first eigenvector based on this index + switch( mi ) + { + case 0: + case 1: + return Vec3( -m[1], m[0], 0.0f ); + + case 2: + return Vec3( m[2], 0.0f, -m[0] ); + + case 3: + case 4: + return Vec3( 0.0f, -m[4], m[3] ); + + default: + return Vec3( 0.0f, -m[5], m[4] ); + } +} + +Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ) +{ + // compute the cubic coefficients + float c0 = matrix[0]*matrix[3]*matrix[5] + + 2.0f*matrix[1]*matrix[2]*matrix[4] + - matrix[0]*matrix[4]*matrix[4] + - matrix[3]*matrix[2]*matrix[2] + - matrix[5]*matrix[1]*matrix[1]; + float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5] + - matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4]; + float 
c2 = matrix[0] + matrix[3] + matrix[5]; + + // compute the quadratic coefficients + float a = c1 - ( 1.0f/3.0f )*c2*c2; + float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0; + + // compute the root count check + float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a; + + // test the multiplicity + if( FLT_EPSILON < Q ) + { + // only one root, which implies we have a multiple of the identity + return Vec3( 1.0f ); + } + else if( Q < -FLT_EPSILON ) + { + // three distinct roots + float theta = std::atan2( std::sqrt( -Q ), -0.5f*b ); + float rho = std::sqrt( 0.25f*b*b - Q ); + + float rt = std::pow( rho, 1.0f/3.0f ); + float ct = std::cos( theta/3.0f ); + float st = std::sin( theta/3.0f ); + + float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct; + float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st ); + float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st ); + + // pick the larger + if( std::fabs( l2 ) > std::fabs( l1 ) ) + l1 = l2; + if( std::fabs( l3 ) > std::fabs( l1 ) ) + l1 = l3; + + // get the eigenvector + return GetMultiplicity1Evector( matrix, l1 ); + } + else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON ) + { + // two roots + float rt; + if( b < 0.0f ) + rt = -std::pow( -0.5f*b, 1.0f/3.0f ); + else + rt = std::pow( 0.5f*b, 1.0f/3.0f ); + + float l1 = ( 1.0f/3.0f )*c2 + rt; // repeated + float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt; + + // get the eigenvector + if( std::fabs( l1 ) > std::fabs( l2 ) ) + return GetMultiplicity2Evector( matrix, l1 ); + else + return GetMultiplicity1Evector( matrix, l2 ); + } +} + +#else + +#define POWER_ITERATION_COUNT 8 + +Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ) +{ + Vec4 const row0( matrix[0], matrix[1], matrix[2], 0.0f ); + Vec4 const row1( matrix[1], matrix[3], matrix[4], 0.0f ); + Vec4 const row2( matrix[2], matrix[4], matrix[5], 0.0f ); + Vec4 v = VEC4_CONST( 1.0f ); + for( int i = 0; i < POWER_ITERATION_COUNT; ++i ) + { + // matrix multiply + Vec4 w = row0*v.SplatX(); + w = MultiplyAdd(row1, v.SplatY(), w); + w = MultiplyAdd(row2, v.SplatZ(), w); + + // get max component from xyz in all channels + Vec4 a = Max(w.SplatX(), Max(w.SplatY(), w.SplatZ())); + + // divide through and advance + v = w*Reciprocal(a); + } + return v.GetVec3(); +} + +#endif + +} // namespace squish diff --git a/maths.h b/maths.h new file mode 100644 index 0000000..769ae46 --- /dev/null +++ b/maths.h @@ -0,0 +1,233 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_MATHS_H +#define SQUISH_MATHS_H + +#include +#include +#include "config.h" + +namespace squish { + +class Vec3 +{ +public: + typedef Vec3 const& Arg; + + Vec3() + { + } + + explicit Vec3( float s ) + { + m_x = s; + m_y = s; + m_z = s; + } + + Vec3( float x, float y, float z ) + { + m_x = x; + m_y = y; + m_z = z; + } + + float X() const { return m_x; } + float Y() const { return m_y; } + float Z() const { return m_z; } + + Vec3 operator-() const + { + return Vec3( -m_x, -m_y, -m_z ); + } + + Vec3& operator+=( Arg v ) + { + m_x += v.m_x; + m_y += v.m_y; + m_z += v.m_z; + return *this; + } + + Vec3& operator-=( Arg v ) + { + m_x -= v.m_x; + m_y -= v.m_y; + m_z -= v.m_z; + return *this; + } + + Vec3& operator*=( Arg v ) + { + m_x *= v.m_x; + m_y *= v.m_y; + m_z *= v.m_z; + return *this; + } + + Vec3& operator*=( float s ) + { + m_x *= s; + m_y *= s; + m_z *= s; + return *this; + } + + Vec3& operator/=( Arg v ) + { + m_x /= v.m_x; + m_y /= v.m_y; + m_z /= v.m_z; + return *this; + } + + Vec3& operator/=( float s ) + { + float t = 1.0f/s; + m_x *= t; + m_y *= t; + m_z *= t; + return *this; + } + + friend Vec3 operator+( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy += right; + } + + friend Vec3 operator-( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy -= right; + } + + friend Vec3 operator*( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy *= right; + } + + friend Vec3 operator*( Arg left, float right ) + { + Vec3 copy( left ); + return copy *= right; + } + + friend Vec3 operator*( float left, Arg right ) + { + Vec3 copy( right ); + return copy *= left; + } + + friend Vec3 operator/( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy /= right; + } + + friend Vec3 operator/( Arg left, float right ) + { + Vec3 copy( left ); + return copy /= right; + } + + friend float Dot( Arg left, Arg right ) + { + return left.m_x*right.m_x + left.m_y*right.m_y + left.m_z*right.m_z; + } + + friend Vec3 Min( Arg left, Arg right ) + { + return Vec3( + std::min( left.m_x, right.m_x ), + std::min( left.m_y, right.m_y ), + std::min( left.m_z, right.m_z ) + ); + } + + friend Vec3 Max( Arg left, Arg right ) + { + return Vec3( + std::max( left.m_x, right.m_x ), + std::max( left.m_y, right.m_y ), + std::max( left.m_z, right.m_z ) + ); + } + + friend Vec3 Truncate( Arg v ) + { + return Vec3( + v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), + v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), + v.m_z > 0.0f ? 
std::floor( v.m_z ) : std::ceil( v.m_z ) + ); + } + +private: + float m_x; + float m_y; + float m_z; +}; + +inline float LengthSquared( Vec3::Arg v ) +{ + return Dot( v, v ); +} + +class Sym3x3 +{ +public: + Sym3x3() + { + } + + Sym3x3( float s ) + { + for( int i = 0; i < 6; ++i ) + m_x[i] = s; + } + + float operator[]( int index ) const + { + return m_x[index]; + } + + float& operator[]( int index ) + { + return m_x[index]; + } + +private: + float m_x[6]; +}; + +Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights ); +Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ); + +} // namespace squish + +#endif // ndef SQUISH_MATHS_H diff --git a/rangefit.cpp b/rangefit.cpp new file mode 100644 index 0000000..3fca124 --- /dev/null +++ b/rangefit.cpp @@ -0,0 +1,201 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#include "rangefit.h" +#include "colourset.h" +#include "colourblock.h" +#include + +namespace squish { + +RangeFit::RangeFit( ColourSet const* colours, int flags, float* metric ) + : ColourFit( colours, flags ) +{ + // initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f) + if( metric ) + m_metric = Vec3( metric[0], metric[1], metric[2] ); + else + m_metric = Vec3( 1.0f ); + + // initialise the best error + m_besterror = FLT_MAX; + + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + float const* weights = m_colours->GetWeights(); + + // get the covariance matrix + Sym3x3 covariance = ComputeWeightedCovariance( count, values, weights ); + + // compute the principle component + Vec3 principle = ComputePrincipleComponent( covariance ); + + // get the min and max range as the codebook endpoints + Vec3 start( 0.0f ); + Vec3 end( 0.0f ); + if( count > 0 ) + { + float min, max; + + // compute the range + start = end = values[0]; + min = max = Dot( values[0], principle ); + for( int i = 1; i < count; ++i ) + { + float val = Dot( values[i], principle ); + if( val < min ) + { + start = values[i]; + min = val; + } + else if( val > max ) + { + end = values[i]; + max = val; + } + } + } + + // clamp the output to [0, 1] + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + start = Min( one, Max( zero, start ) ); + end = Min( one, Max( zero, end ) ); + + // clamp to the grid and save + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + Vec3 const half( 0.5f ); + m_start = Truncate( grid*start + half )*gridrcp; + m_end = Truncate( grid*end + half )*gridrcp; +} + +void RangeFit::Compress3( void* block ) +{ + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // create a codebook + Vec3 codes[3]; + codes[0] = m_start; + codes[1] = m_end; + codes[2] = 0.5f*m_start + 0.5f*m_end; + + // match each point to the closest code + u8 closest[16]; + float error = 0.0f; + for( int i = 0; i < count; ++i ) + { + // find the closest code + float dist = FLT_MAX; + int idx = 0; + for( int j = 0; j < 3; ++j ) + { + float d = LengthSquared( m_metric*( values[i] - codes[j] ) ); + if( d < dist ) + { + dist = d; + idx = j; + } + } + + // save the index + closest[i] = ( u8 )idx; + + // accumulate the error + error += dist; + } + + // save this scheme if it wins + if( error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( closest, indices ); + + // save the block + WriteColourBlock3( m_start, m_end, indices, block ); + + // save the error + m_besterror = error; + } +} + +void RangeFit::Compress4( void* block ) +{ + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // create a codebook + Vec3 codes[4]; + codes[0] = m_start; + codes[1] = m_end; + codes[2] = ( 2.0f/3.0f )*m_start + ( 1.0f/3.0f )*m_end; + codes[3] = ( 1.0f/3.0f )*m_start + ( 2.0f/3.0f )*m_end; + + // match each point to the closest code + u8 closest[16]; + float error = 0.0f; + for( int i = 0; i < count; ++i ) + { + // find the closest code + float dist = FLT_MAX; + int idx = 0; + for( int j = 0; j < 4; ++j ) + { + float d = LengthSquared( m_metric*( values[i] - codes[j] ) ); + if( d < dist ) + { + dist = d; + idx = j; + } + } + + // save the index + closest[i] = ( u8 )idx; + + // accumulate 
the error + error += dist; + } + + // save this scheme if it wins + if( error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( closest, indices ); + + // save the block + WriteColourBlock4( m_start, m_end, indices, block ); + + // save the error + m_besterror = error; + } +} + +} // namespace squish diff --git a/rangefit.h b/rangefit.h new file mode 100644 index 0000000..e293bdc --- /dev/null +++ b/rangefit.h @@ -0,0 +1,54 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_RANGEFIT_H +#define SQUISH_RANGEFIT_H + +#include "squish.h" +#include "colourfit.h" +#include "maths.h" + +namespace squish { + +class ColourSet; + +class RangeFit : public ColourFit +{ +public: + RangeFit( ColourSet const* colours, int flags, float* metric ); + +private: + virtual void Compress3( void* block ); + virtual void Compress4( void* block ); + + Vec3 m_metric; + Vec3 m_start; + Vec3 m_end; + float m_besterror; +}; + +} // squish + +#endif // ndef SQUISH_RANGEFIT_H diff --git a/simd.h b/simd.h new file mode 100644 index 0000000..22bd10a --- /dev/null +++ b/simd.h @@ -0,0 +1,40 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SIMD_H +#define SQUISH_SIMD_H + +#include "maths.h" + +#if SQUISH_USE_ALTIVEC +#include "simd_ve.h" +#elif SQUISH_USE_SSE +#include "simd_sse.h" +#else +#include "simd_float.h" +#endif + + +#endif // ndef SQUISH_SIMD_H diff --git a/simd_float.h b/simd_float.h new file mode 100644 index 0000000..e6351b8 --- /dev/null +++ b/simd_float.h @@ -0,0 +1,183 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SIMD_FLOAT_H +#define SQUISH_SIMD_FLOAT_H + +#include + +namespace squish { + +#define VEC4_CONST( X ) Vec4( X ) + +class Vec4 +{ +public: + typedef Vec4 const& Arg; + + Vec4() {} + + explicit Vec4( float s ) + : m_x( s ), + m_y( s ), + m_z( s ), + m_w( s ) + { + } + + Vec4( float x, float y, float z, float w ) + : m_x( x ), + m_y( y ), + m_z( z ), + m_w( w ) + { + } + + Vec3 GetVec3() const + { + return Vec3( m_x, m_y, m_z ); + } + + Vec4 SplatX() const { return Vec4( m_x ); } + Vec4 SplatY() const { return Vec4( m_y ); } + Vec4 SplatZ() const { return Vec4( m_z ); } + Vec4 SplatW() const { return Vec4( m_w ); } + + Vec4& operator+=( Arg v ) + { + m_x += v.m_x; + m_y += v.m_y; + m_z += v.m_z; + m_w += v.m_w; + return *this; + } + + Vec4& operator-=( Arg v ) + { + m_x -= v.m_x; + m_y -= v.m_y; + m_z -= v.m_z; + m_w -= v.m_w; + return *this; + } + + Vec4& operator*=( Arg v ) + { + m_x *= v.m_x; + m_y *= v.m_y; + m_z *= v.m_z; + m_w *= v.m_w; + return *this; + } + + friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right ) + { + Vec4 copy( left ); + return copy += right; + } + + friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right ) + { + Vec4 copy( left ); + return copy -= right; + } + + friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right ) + { + Vec4 copy( left ); + return copy *= right; + } + + //! Returns a*b + c + friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return a*b + c; + } + + //! 
Returns -( a*b - c ) + friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return c - a*b; + } + + friend Vec4 Reciprocal( Vec4::Arg v ) + { + return Vec4( + 1.0f/v.m_x, + 1.0f/v.m_y, + 1.0f/v.m_z, + 1.0f/v.m_w + ); + } + + friend Vec4 Min( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( + std::min( left.m_x, right.m_x ), + std::min( left.m_y, right.m_y ), + std::min( left.m_z, right.m_z ), + std::min( left.m_w, right.m_w ) + ); + } + + friend Vec4 Max( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( + std::max( left.m_x, right.m_x ), + std::max( left.m_y, right.m_y ), + std::max( left.m_z, right.m_z ), + std::max( left.m_w, right.m_w ) + ); + } + + friend Vec4 Truncate( Vec4::Arg v ) + { + return Vec4( + v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), + v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), + v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z ), + v.m_w > 0.0f ? std::floor( v.m_w ) : std::ceil( v.m_w ) + ); + } + + friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) + { + return left.m_x < right.m_x + || left.m_y < right.m_y + || left.m_z < right.m_z + || left.m_w < right.m_w; + } + +private: + float m_x; + float m_y; + float m_z; + float m_w; +}; + +} // namespace squish + +#endif // ndef SQUISH_SIMD_FLOAT_H + diff --git a/simd_sse.h b/simd_sse.h new file mode 100644 index 0000000..e584f2a --- /dev/null +++ b/simd_sse.h @@ -0,0 +1,180 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+ -------------------------------------------------------------------------- */
+
+#ifndef SQUISH_SIMD_SSE_H
+#define SQUISH_SIMD_SSE_H
+
+#include <xmmintrin.h>
+#if ( SQUISH_USE_SSE > 1 )
+#include <emmintrin.h>
+#endif
+
+#define SQUISH_SSE_SPLAT( a ) \
+ ( ( a ) | ( ( a ) << 2 ) | ( ( a ) << 4 ) | ( ( a ) << 6 ) )
+
+#define SQUISH_SSE_SHUF( x, y, z, w ) \
+ ( ( x ) | ( ( y ) << 2 ) | ( ( z ) << 4 ) | ( ( w ) << 6 ) )
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( X )
+
+class Vec4
+{
+public:
+ typedef Vec4 const& Arg;
+
+ Vec4() {}
+
+ explicit Vec4( __m128 v ) : m_v( v ) {}
+
+ Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}
+
+ Vec4& operator=( Vec4 const& arg )
+ {
+ m_v = arg.m_v;
+ return *this;
+ }
+
+ explicit Vec4( float s ) : m_v( _mm_set1_ps( s ) ) {}
+
+ Vec4( float x, float y, float z, float w ) : m_v( _mm_setr_ps( x, y, z, w ) ) {}
+
+ Vec3 GetVec3() const
+ {
+#ifdef __GNUC__
+ __attribute__ ((__aligned__ (16))) float c[4];
+#else
+ __declspec(align(16)) float c[4];
+#endif
+ _mm_store_ps( c, m_v );
+ return Vec3( c[0], c[1], c[2] );
+ }
+
+ Vec4 SplatX() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) ) ); }
+ Vec4 SplatY() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) ) ); }
+ Vec4 SplatZ() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 2 ) ) ); }
+ Vec4 SplatW() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 3 ) ) ); }
+
+ Vec4& operator+=( Arg v )
+ {
+ m_v = _mm_add_ps( m_v, v.m_v );
+ return *this;
+ }
+
+ Vec4& operator-=( Arg v )
+ {
+ m_v = _mm_sub_ps( m_v, v.m_v );
+ return *this;
+ }
+
+ Vec4& operator*=( Arg v )
+ {
+ m_v = _mm_mul_ps( m_v, v.m_v );
+ return *this;
+ }
+
+ friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right )
+ {
+ return Vec4( _mm_add_ps( left.m_v, right.m_v ) );
+ }
+
+ friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right )
+ {
+ return Vec4( _mm_sub_ps( left.m_v, right.m_v ) );
+ }
+
+ friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right )
+ {
+ return Vec4( _mm_mul_ps( left.m_v, right.m_v ) );
+ }
+
+ //! Returns a*b + c
+ friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+ {
+ return Vec4( _mm_add_ps( _mm_mul_ps( a.m_v, b.m_v ), c.m_v ) );
+ }
+
+ //!
Returns -( a*b - c ) + friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return Vec4( _mm_sub_ps( c.m_v, _mm_mul_ps( a.m_v, b.m_v ) ) ); + } + + friend Vec4 Reciprocal( Vec4::Arg v ) + { + // get the reciprocal estimate + __m128 estimate = _mm_rcp_ps( v.m_v ); + + // one round of Newton-Rhaphson refinement + __m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.m_v ) ); + return Vec4( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) ); + } + + friend Vec4 Min( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_min_ps( left.m_v, right.m_v ) ); + } + + friend Vec4 Max( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_max_ps( left.m_v, right.m_v ) ); + } + + friend Vec4 Truncate( Vec4::Arg v ) + { +#if ( SQUISH_USE_SSE == 1 ) + // convert to ints + __m128 input = v.m_v; + __m64 lo = _mm_cvttps_pi32( input ); + __m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) ); + + // convert to floats + __m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) ); + __m128 truncated = _mm_cvtpi32_ps( part, lo ); + + // clear out the MMX multimedia state to allow FP calls later + _mm_empty(); + return Vec4( truncated ); +#else + // use SSE2 instructions + return Vec4( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.m_v ) ) ); +#endif + } + + friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) + { + __m128 bits = _mm_cmplt_ps( left.m_v, right.m_v ); + int value = _mm_movemask_ps( bits ); + return value != 0; + } + +private: + __m128 m_v; +}; + +} // namespace squish + +#endif // ndef SQUISH_SIMD_SSE_H diff --git a/simd_ve.h b/simd_ve.h new file mode 100644 index 0000000..70cb326 --- /dev/null +++ b/simd_ve.h @@ -0,0 +1,166 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+ -------------------------------------------------------------------------- */
+
+#ifndef SQUISH_SIMD_VE_H
+#define SQUISH_SIMD_VE_H
+
+#include <altivec.h>
+#undef bool
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( ( vector float ){ X } )
+
+class Vec4
+{
+public:
+ typedef Vec4 Arg;
+
+ Vec4() {}
+
+ explicit Vec4( vector float v ) : m_v( v ) {}
+
+ Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}
+
+ Vec4& operator=( Vec4 const& arg )
+ {
+ m_v = arg.m_v;
+ return *this;
+ }
+
+ explicit Vec4( float s )
+ {
+ union { vector float v; float c[4]; } u;
+ u.c[0] = s;
+ u.c[1] = s;
+ u.c[2] = s;
+ u.c[3] = s;
+ m_v = u.v;
+ }
+
+ Vec4( float x, float y, float z, float w )
+ {
+ union { vector float v; float c[4]; } u;
+ u.c[0] = x;
+ u.c[1] = y;
+ u.c[2] = z;
+ u.c[3] = w;
+ m_v = u.v;
+ }
+
+ Vec3 GetVec3() const
+ {
+ union { vector float v; float c[4]; } u;
+ u.v = m_v;
+ return Vec3( u.c[0], u.c[1], u.c[2] );
+ }
+
+ Vec4 SplatX() const { return Vec4( vec_splat( m_v, 0 ) ); }
+ Vec4 SplatY() const { return Vec4( vec_splat( m_v, 1 ) ); }
+ Vec4 SplatZ() const { return Vec4( vec_splat( m_v, 2 ) ); }
+ Vec4 SplatW() const { return Vec4( vec_splat( m_v, 3 ) ); }
+
+ Vec4& operator+=( Arg v )
+ {
+ m_v = vec_add( m_v, v.m_v );
+ return *this;
+ }
+
+ Vec4& operator-=( Arg v )
+ {
+ m_v = vec_sub( m_v, v.m_v );
+ return *this;
+ }
+
+ Vec4& operator*=( Arg v )
+ {
+ m_v = vec_madd( m_v, v.m_v, ( vector float ){ -0.0f } );
+ return *this;
+ }
+
+ friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right )
+ {
+ return Vec4( vec_add( left.m_v, right.m_v ) );
+ }
+
+ friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right )
+ {
+ return Vec4( vec_sub( left.m_v, right.m_v ) );
+ }
+
+ friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right )
+ {
+ return Vec4( vec_madd( left.m_v, right.m_v, ( vector float ){ -0.0f } ) );
+ }
+
+ //! Returns a*b + c
+ friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+ {
+ return Vec4( vec_madd( a.m_v, b.m_v, c.m_v ) );
+ }
+
+ //!
Returns -( a*b - c ) + friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return Vec4( vec_nmsub( a.m_v, b.m_v, c.m_v ) ); + } + + friend Vec4 Reciprocal( Vec4::Arg v ) + { + // get the reciprocal estimate + vector float estimate = vec_re( v.m_v ); + + // one round of Newton-Rhaphson refinement + vector float diff = vec_nmsub( estimate, v.m_v, ( vector float ){ 1.0f } ); + return Vec4( vec_madd( diff, estimate, estimate ) ); + } + + friend Vec4 Min( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_min( left.m_v, right.m_v ) ); + } + + friend Vec4 Max( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_max( left.m_v, right.m_v ) ); + } + + friend Vec4 Truncate( Vec4::Arg v ) + { + return Vec4( vec_trunc( v.m_v ) ); + } + + friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) + { + return vec_any_lt( left.m_v, right.m_v ) != 0; + } + +private: + vector float m_v; +}; + +} // namespace squish + +#endif // ndef SQUISH_SIMD_VE_H diff --git a/singlecolourfit.cpp b/singlecolourfit.cpp new file mode 100644 index 0000000..e8a0117 --- /dev/null +++ b/singlecolourfit.cpp @@ -0,0 +1,172 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#include "singlecolourfit.h" +#include "colourset.h" +#include "colourblock.h" + +namespace squish { + +struct SourceBlock +{ + u8 start; + u8 end; + u8 error; +}; + +struct SingleColourLookup +{ + SourceBlock sources[2]; +}; + +#include "singlecolourlookup.inl" + +static int FloatToInt( float a, int limit ) +{ + // use ANSI round-to-zero behaviour to get round-to-nearest + int i = ( int )( a + 0.5f ); + + // clamp to the limit + if( i < 0 ) + i = 0; + else if( i > limit ) + i = limit; + + // done + return i; +} + +SingleColourFit::SingleColourFit( ColourSet const* colours, int flags ) + : ColourFit( colours, flags ) +{ + // grab the single colour + Vec3 const* values = m_colours->GetPoints(); + m_colour[0] = ( u8 )FloatToInt( 255.0f*values->X(), 255 ); + m_colour[1] = ( u8 )FloatToInt( 255.0f*values->Y(), 255 ); + m_colour[2] = ( u8 )FloatToInt( 255.0f*values->Z(), 255 ); + + // initialise the best error + m_besterror = INT_MAX; +} + +void SingleColourFit::Compress3( void* block ) +{ + // build the table of lookups + SingleColourLookup const* const lookups[] = + { + lookup_5_3, + lookup_6_3, + lookup_5_3 + }; + + // find the best end-points and index + ComputeEndPoints( lookups ); + + // build the block if we win + if( m_error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( &m_index, indices ); + + // save the block + WriteColourBlock3( m_start, m_end, indices, block ); + + // save the error + m_besterror = m_error; + } +} + +void SingleColourFit::Compress4( void* block ) +{ + // build the table of lookups + SingleColourLookup const* const lookups[] = + { + lookup_5_4, + lookup_6_4, + lookup_5_4 + }; + + // find the best end-points and index + ComputeEndPoints( lookups ); + + // build the block if we win + if( m_error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( &m_index, indices ); + + // save the block + WriteColourBlock4( m_start, m_end, indices, block ); + + // save the error + m_besterror = m_error; + } +} + +void SingleColourFit::ComputeEndPoints( SingleColourLookup const* const* lookups ) +{ + // check each index combination (endpoint or intermediate) + m_error = INT_MAX; + for( int index = 0; index < 2; ++index ) + { + // check the error for this codebook index + SourceBlock const* sources[3]; + int error = 0; + for( int channel = 0; channel < 3; ++channel ) + { + // grab the lookup table and index for this channel + SingleColourLookup const* lookup = lookups[channel]; + int target = m_colour[channel]; + + // store a pointer to the source for this channel + sources[channel] = lookup[target].sources + index; + + // accumulate the error + int diff = sources[channel]->error; + error += diff*diff; + } + + // keep it if the error is lower + if( error < m_error ) + { + m_start = Vec3( + ( float )sources[0]->start/31.0f, + ( float )sources[1]->start/63.0f, + ( float )sources[2]->start/31.0f + ); + m_end = Vec3( + ( float )sources[0]->end/31.0f, + ( float )sources[1]->end/63.0f, + ( float )sources[2]->end/31.0f + ); + m_index = ( u8 )( 2*index ); + m_error = error; + } + } +} + +} // namespace squish diff --git a/singlecolourfit.h b/singlecolourfit.h new file mode 100644 index 0000000..54ec17e --- /dev/null +++ b/singlecolourfit.h @@ -0,0 +1,58 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of 
charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SINGLECOLOURFIT_H +#define SQUISH_SINGLECOLOURFIT_H + +#include "squish.h" +#include "colourfit.h" + +namespace squish { + +class ColourSet; +struct SingleColourLookup; + +class SingleColourFit : public ColourFit +{ +public: + SingleColourFit( ColourSet const* colours, int flags ); + +private: + virtual void Compress3( void* block ); + virtual void Compress4( void* block ); + + void ComputeEndPoints( SingleColourLookup const* const* lookups ); + + u8 m_colour[3]; + Vec3 m_start; + Vec3 m_end; + u8 m_index; + int m_error; + int m_besterror; +}; + +} // namespace squish + +#endif // ndef SQUISH_SINGLECOLOURFIT_H diff --git a/squish.cpp b/squish.cpp new file mode 100644 index 0000000..2af83f3 --- /dev/null +++ b/squish.cpp @@ -0,0 +1,230 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+ -------------------------------------------------------------------------- */
+
+#include <squish.h>
+#include "colourset.h"
+#include "maths.h"
+#include "rangefit.h"
+#include "clusterfit.h"
+#include "colourblock.h"
+#include "alpha.h"
+#include "singlecolourfit.h"
+
+namespace squish {
+
+static int FixFlags( int flags )
+{
+ // grab the flag bits
+ int method = flags & ( kDxt1 | kDxt3 | kDxt5 );
+ int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit );
+ int extra = flags & kWeightColourByAlpha;
+
+ // set defaults
+ if( method != kDxt3 && method != kDxt5 )
+ method = kDxt1;
+ if( fit != kColourRangeFit && fit != kColourIterativeClusterFit )
+ fit = kColourClusterFit;
+
+ // done
+ return method | fit | extra;
+}
+
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric )
+{
+ // fix any bad flags
+ flags = FixFlags( flags );
+
+ // get the block locations
+ void* colourBlock = block;
+ void* alphaBlock = block;
+ if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
+ colourBlock = reinterpret_cast< u8* >( block ) + 8;
+
+ // create the minimal point set
+ ColourSet colours( rgba, mask, flags );
+
+ // check the compression type and compress colour
+ if( colours.GetCount() == 1 )
+ {
+ // always do a single colour fit
+ SingleColourFit fit( &colours, flags );
+ fit.Compress( colourBlock );
+ }
+ else if( ( flags & kColourRangeFit ) != 0 || colours.GetCount() == 0 )
+ {
+ // do a range fit
+ RangeFit fit( &colours, flags, metric );
+ fit.Compress( colourBlock );
+ }
+ else
+ {
+ // default to a cluster fit (could be iterative or not)
+ ClusterFit fit( &colours, flags, metric );
+ fit.Compress( colourBlock );
+ }
+
+ // compress alpha separately if necessary
+ if( ( flags & kDxt3 ) != 0 )
+ CompressAlphaDxt3( rgba, mask, alphaBlock );
+ else if( ( flags & kDxt5 ) != 0 )
+ CompressAlphaDxt5( rgba, mask, alphaBlock );
+}
+
+void Decompress( u8* rgba, void const* block, int flags )
+{
+ // fix any bad flags
+ flags = FixFlags( flags );
+
+ // get the block locations
+ void const* colourBlock = block;
+ void const* alphaBlock = block;
+ if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
+ colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
+
+ // decompress colour
+ DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+
+ // decompress alpha separately if necessary
+ if( ( flags & kDxt3 ) != 0 )
+ DecompressAlphaDxt3( rgba, alphaBlock );
+ else if( ( flags & kDxt5 ) != 0 )
+ DecompressAlphaDxt5( rgba, alphaBlock );
+}
+
+int GetStorageRequirements( int width, int height, int flags )
+{
+ // fix any bad flags
+ flags = FixFlags( flags );
+
+ // compute the storage requirements
+ int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 );
+ int blocksize = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+ return blockcount*blocksize;
+}
+
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric )
+{
+ // fix any bad flags
+ flags = FixFlags( flags );
+
+ // initialise the block output
+ u8* targetBlock = reinterpret_cast< u8* >( blocks );
+ int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ?
8 : 16; + + // loop over blocks + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 4 ) + { + // build the 4x4 block of pixels + u8 sourceRgba[16*4]; + u8* targetPixel = sourceRgba; + int mask = 0; + for( int py = 0; py < 4; ++py ) + { + for( int px = 0; px < 4; ++px ) + { + // get the source pixel in the image + int sx = x + px; + int sy = y + py; + + // enable if we're in the image + if( sx < width && sy < height ) + { + // copy the rgba value + u8 const* sourcePixel = rgba + 4*( width*sy + sx ); + for( int i = 0; i < 4; ++i ) + *targetPixel++ = *sourcePixel++; + + // enable this pixel + mask |= ( 1 << ( 4*py + px ) ); + } + else + { + // skip this pixel as its outside the image + targetPixel += 4; + } + } + } + + // compress it into the output + CompressMasked( sourceRgba, mask, targetBlock, flags, metric ); + + // advance + targetBlock += bytesPerBlock; + } + } +} + +void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags ) +{ + // fix any bad flags + flags = FixFlags( flags ); + + // initialise the block input + u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks ); + int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16; + + // loop over blocks + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 4 ) + { + // decompress the block + u8 targetRgba[4*16]; + Decompress( targetRgba, sourceBlock, flags ); + + // write the decompressed pixels to the correct image locations + u8 const* sourcePixel = targetRgba; + for( int py = 0; py < 4; ++py ) + { + for( int px = 0; px < 4; ++px ) + { + // get the target location + int sx = x + px; + int sy = y + py; + if( sx < width && sy < height ) + { + u8* targetPixel = rgba + 4*( width*sy + sx ); + + // copy the rgba value + for( int i = 0; i < 4; ++i ) + *targetPixel++ = *sourcePixel++; + } + else + { + // skip this pixel as its outside the image + sourcePixel += 4; + } + } + } + + // advance + sourceBlock += bytesPerBlock; + } + } +} + +} // namespace squish diff --git a/squish.h b/squish.h new file mode 100644 index 0000000..212b54c --- /dev/null +++ b/squish.h @@ -0,0 +1,263 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_H +#define SQUISH_H + +//! All squish API functions live in this namespace. 
+namespace squish { + +// ----------------------------------------------------------------------------- + +//! Typedef a quantity that is a single unsigned byte. +typedef unsigned char u8; + +// ----------------------------------------------------------------------------- + +enum +{ + //! Use DXT1 compression. + kDxt1 = ( 1 << 0 ), + + //! Use DXT3 compression. + kDxt3 = ( 1 << 1 ), + + //! Use DXT5 compression. + kDxt5 = ( 1 << 2 ), + + //! Use a very slow but very high quality colour compressor. + kColourIterativeClusterFit = ( 1 << 8 ), + + //! Use a slow but high quality colour compressor (the default). + kColourClusterFit = ( 1 << 3 ), + + //! Use a fast but low quality colour compressor. + kColourRangeFit = ( 1 << 4 ), + + //! Weight the colour by alpha during cluster fit (disabled by default). + kWeightColourByAlpha = ( 1 << 7 ) +}; + +// ----------------------------------------------------------------------------- + +/*! @brief Compresses a 4x4 block of pixels. + + @param rgba The rgba values of the 16 source pixels. + @param mask The valid pixel mask. + @param block Storage for the compressed DXT block. + @param flags Compression flags. + @param metric An optional perceptual metric. + + The source pixels should be presented as a contiguous array of 16 rgba + values, with each component as 1 byte each. In memory this should be: + + { r1, g1, b1, a1, .... , r16, g16, b16, a16 } + + The mask parameter enables only certain pixels within the block. The lowest + bit enables the first pixel and so on up to the 16th bit. Bits beyond the + 16th bit are ignored. Pixels that are not enabled are allowed to take + arbitrary colours in the output block. An example of how this can be used + is in the CompressImage function to disable pixels outside the bounds of + the image when the width or height is not divisible by 4. + + The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, + however, DXT1 will be used by default if none is specified. When using DXT1 + compression, 8 bytes of storage are required for the compressed DXT block. + DXT3 and DXT5 compression require 16 bytes of storage per block. + + The flags parameter can also specify a preferred colour compressor to use + when fitting the RGB components of the data. Possible colour compressors + are: kColourClusterFit (the default), kColourRangeFit (very fast, low + quality) or kColourIterativeClusterFit (slowest, best quality). + + When using kColourClusterFit or kColourIterativeClusterFit, an additional + flag can be specified to weight the importance of each pixel by its alpha + value. For images that are rendered using alpha blending, this can + significantly increase the perceived quality. + + The metric parameter can be used to weight the relative importance of each + colour channel, or pass NULL to use the default uniform weight of + { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that + allowed either uniform or "perceptual" weights with the fixed values + { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a + contiguous array of 3 floats. +*/ +void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 ); + +// ----------------------------------------------------------------------------- + +/*! @brief Compresses a 4x4 block of pixels. + + @param rgba The rgba values of the 16 source pixels. + @param block Storage for the compressed DXT block. + @param flags Compression flags. + @param metric An optional perceptual metric. 
+ + The source pixels should be presented as a contiguous array of 16 rgba + values, with each component as 1 byte each. In memory this should be: + + { r1, g1, b1, a1, .... , r16, g16, b16, a16 } + + The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, + however, DXT1 will be used by default if none is specified. When using DXT1 + compression, 8 bytes of storage are required for the compressed DXT block. + DXT3 and DXT5 compression require 16 bytes of storage per block. + + The flags parameter can also specify a preferred colour compressor to use + when fitting the RGB components of the data. Possible colour compressors + are: kColourClusterFit (the default), kColourRangeFit (very fast, low + quality) or kColourIterativeClusterFit (slowest, best quality). + + When using kColourClusterFit or kColourIterativeClusterFit, an additional + flag can be specified to weight the importance of each pixel by its alpha + value. For images that are rendered using alpha blending, this can + significantly increase the perceived quality. + + The metric parameter can be used to weight the relative importance of each + colour channel, or pass NULL to use the default uniform weight of + { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that + allowed either uniform or "perceptual" weights with the fixed values + { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a + contiguous array of 3 floats. + + This method is an inline that calls CompressMasked with a mask of 0xffff, + provided for compatibility with older versions of squish. +*/ +inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 ) +{ + CompressMasked( rgba, 0xffff, block, flags, metric ); +} + +// ----------------------------------------------------------------------------- + +/*! @brief Decompresses a 4x4 block of pixels. + + @param rgba Storage for the 16 decompressed pixels. + @param block The compressed DXT block. + @param flags Compression flags. + + The decompressed pixels will be written as a contiguous array of 16 rgba + values, with each component as 1 byte each. In memory this is: + + { r1, g1, b1, a1, .... , r16, g16, b16, a16 } + + The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. +*/ +void Decompress( u8* rgba, void const* block, int flags ); + +// ----------------------------------------------------------------------------- + +/*! @brief Computes the amount of compressed storage required. + + @param width The width of the image. + @param height The height of the image. + @param flags Compression flags. + + The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. + + Most DXT images will be a multiple of 4 in each dimension, but this + function supports arbitrary size images by allowing the outer blocks to + be only partially used. +*/ +int GetStorageRequirements( int width, int height, int flags ); + +// ----------------------------------------------------------------------------- + +/*! @brief Compresses an image in memory. + + @param rgba The pixels of the source. + @param width The width of the source image. + @param height The height of the source image. + @param blocks Storage for the compressed output. + @param flags Compression flags. + @param metric An optional perceptual metric. 
+ + The source pixels should be presented as a contiguous array of width*height + rgba values, with each component as 1 byte each. In memory this should be: + + { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height + + The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, + however, DXT1 will be used by default if none is specified. When using DXT1 + compression, 8 bytes of storage are required for each compressed DXT block. + DXT3 and DXT5 compression require 16 bytes of storage per block. + + The flags parameter can also specify a preferred colour compressor to use + when fitting the RGB components of the data. Possible colour compressors + are: kColourClusterFit (the default), kColourRangeFit (very fast, low + quality) or kColourIterativeClusterFit (slowest, best quality). + + When using kColourClusterFit or kColourIterativeClusterFit, an additional + flag can be specified to weight the importance of each pixel by its alpha + value. For images that are rendered using alpha blending, this can + significantly increase the perceived quality. + + The metric parameter can be used to weight the relative importance of each + colour channel, or pass NULL to use the default uniform weight of + { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that + allowed either uniform or "perceptual" weights with the fixed values + { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a + contiguous array of 3 floats. + + Internally this function calls squish::CompressMasked for each block, which + allows for pixels outside the image to take arbitrary values. The function + squish::GetStorageRequirements can be called to compute the amount of memory + to allocate for the compressed output. +*/ +void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 ); + +// ----------------------------------------------------------------------------- + +/*! @brief Decompresses an image in memory. + + @param rgba Storage for the decompressed pixels. + @param width The width of the source image. + @param height The height of the source image. + @param blocks The compressed DXT blocks. + @param flags Compression flags. + + The decompressed pixels will be written as a contiguous array of width*height + 16 rgba values, with each component as 1 byte each. In memory this is: + + { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height + + The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. + + Internally this function calls squish::Decompress for each block. +*/ +void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags ); + +// ----------------------------------------------------------------------------- + +} // namespace squish + +#endif // ndef SQUISH_H +
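A minimal usage sketch (not part of the patch), assuming squish.h is on the include path and the library built above is linked: it compresses a made-up RGBA image to DXT1 blocks with the public API documented in squish.h and decompresses it again. As a worked example of GetStorageRequirements, a 30x20 DXT5 image needs ((30+3)/4)*((20+3)/4) = 8*5 = 40 blocks of 16 bytes, i.e. 640 bytes.

#include <squish.h>
#include <vector>

int main()
{
    // Dimensions of a hypothetical test image; any size is allowed, since the
    // library handles partial edge blocks via the mask in CompressMasked.
    int const width = 64;
    int const height = 64;

    // Fill a made-up RGBA gradient (4 bytes per pixel: r, g, b, a).
    std::vector< squish::u8 > rgba( width*height*4 );
    for( int y = 0; y < height; ++y )
    {
        for( int x = 0; x < width; ++x )
        {
            squish::u8* pixel = &rgba[4*( width*y + x )];
            pixel[0] = ( squish::u8 )( 4*x );
            pixel[1] = ( squish::u8 )( 4*y );
            pixel[2] = 128;
            pixel[3] = 255;
        }
    }

    // Ask the library how many bytes the compressed blocks occupy.
    int flags = squish::kDxt1 | squish::kColourClusterFit;
    std::vector< squish::u8 > blocks( squish::GetStorageRequirements( width, height, flags ) );

    // Compress, then decompress into a second buffer.
    squish::CompressImage( &rgba[0], width, height, &blocks[0], flags );

    std::vector< squish::u8 > decompressed( width*height*4 );
    squish::DecompressImage( &decompressed[0], width, height, &blocks[0], flags );

    return 0;
}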
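Another sketch (again not part of the patch) for the block-level entry point CompressMasked: only four pixels of the 4x4 block are enabled through the mask, and the optional metric uses the perceptual channel weights quoted as an example in the squish.h documentation. The helper name CompressCorner is hypothetical.

#include <squish.h>

// Hypothetical helper: compress one 4x4 block in which only the top-left
// 2x2 pixels are valid. blockRgba must point at 16 rgba values (64 bytes);
// out must point at 8 bytes of storage, since DXT1 is requested.
void CompressCorner( squish::u8 const* blockRgba, void* out )
{
    // Bit n of the mask enables pixel n: pixels 0 and 1 of the first row,
    // pixels 4 and 5 of the second row. Disabled pixels may take arbitrary
    // colours in the output block.
    int mask = ( 1 << 0 ) | ( 1 << 1 ) | ( 1 << 4 ) | ( 1 << 5 );

    // Optional per-channel weights (r, g, b).
    float metric[3] = { 0.2126f, 0.7152f, 0.0722f };

    squish::CompressMasked( blockRgba, mask, out,
        squish::kDxt1 | squish::kColourRangeFit, metric );
}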
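Finally, an illustrative restatement in plain floats (not library code) of the power iteration used by the SIMD branch of ComputePrincipleComponent in maths.cpp: repeatedly multiply a seed vector by the symmetric covariance matrix and divide through by the largest component, so the vector converges towards the dominant eigenvector, i.e. the principal axis along which the block's colours are spread. The 3x3 matrix below is made up purely for the demonstration.

#include <cstdio>

int main()
{
    // Symmetric 3x3 matrix stored like squish's Sym3x3:
    //   [ m0 m1 m2 ]
    //   [ m1 m3 m4 ]
    //   [ m2 m4 m5 ]
    float m[6] = { 4.0f, 1.0f, 0.0f, 3.0f, 1.0f, 2.0f };

    float v[3] = { 1.0f, 1.0f, 1.0f };
    for( int i = 0; i < 8; ++i )    // 8 matches POWER_ITERATION_COUNT in maths.cpp
    {
        // w = M*v
        float w[3] = {
            m[0]*v[0] + m[1]*v[1] + m[2]*v[2],
            m[1]*v[0] + m[3]*v[1] + m[4]*v[2],
            m[2]*v[0] + m[4]*v[1] + m[5]*v[2]
        };

        // divide through by the largest component, as the Vec4 version does
        float a = w[0];
        if( w[1] > a ) a = w[1];
        if( w[2] > a ) a = w[2];
        for( int j = 0; j < 3; ++j )
            v[j] = w[j]/a;
    }

    std::printf( "approximate principal axis: %f %f %f\n", v[0], v[1], v[2] );
    return 0;
}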