diff --git a/clusterfit.cpp b/clusterfit.cpp
index 9670446..b5c685a 100644
--- a/clusterfit.cpp
+++ b/clusterfit.cpp
@@ -27,6 +27,7 @@
 #include "clusterfit.h"
 #include "colourset.h"
 #include "colourblock.h"
+#include "colourblockGCN.h"
 #include <cfloat>
 
 namespace squish {
@@ -237,7 +238,10 @@ void ClusterFit::Compress3( void* block )
 		m_colours->RemapIndices( unordered, bestindices );
 		
 		// save the block
-		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+		if ( ( m_flags & kDxt1GCN ) != 0 )
+			WriteColourBlock3GCN( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+		else
+			WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
 
 		// save the error
 		m_besterror = besterror;
@@ -382,7 +386,10 @@ void ClusterFit::Compress4( void* block )
 		m_colours->RemapIndices( unordered, bestindices );
 		
 		// save the block
-		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+		if ( ( m_flags & kDxt1GCN ) != 0 )
+			WriteColourBlock4GCN( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+		else
+			WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
 
 		// save the error
 		m_besterror = besterror;
diff --git a/colourblockGCN.cpp b/colourblockGCN.cpp
index 5d96dd6..3b57b7b 100644
--- a/colourblockGCN.cpp
+++ b/colourblockGCN.cpp
@@ -1,28 +1,28 @@
 /* -----------------------------------------------------------------------------
 
-    Fork of colourblock.cpp from libSquish.. modified to encode/decode DXT1
-    packed for the Nintendo GameCube's GX hardware.
+	Fork of colourblock.cpp from libSquish, modified to encode/decode DXT1
+	packed for the Nintendo GameCube's GX hardware.
 
-    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
 
-    Permission is hereby granted, free of charge, to any person obtaining
-    a copy of this software and associated documentation files (the
-    "Software"), to	deal in the Software without restriction, including
-    without limitation the rights to use, copy, modify, merge, publish,
-    distribute, sublicense, and/or sell copies of the Software, and to
-    permit persons to whom the Software is furnished to do so, subject to
-    the following conditions:
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the
+	"Software"), to deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to
+	permit persons to whom the Software is furnished to do so, subject to
+	the following conditions:
 
-    The above copyright notice and this permission notice shall be included
-    in all copies or substantial portions of the Software.
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
 
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
    -------------------------------------------------------------------------- */
 
@@ -32,188 +32,188 @@ namespace squish {
 
 static int FloatToInt( float a, int limit )
 {
-    // use ANSI round-to-zero behaviour to get round-to-nearest
-    int i = ( int )( a + 0.5f );
+	// use ANSI round-to-zero behaviour to get round-to-nearest
+	int i = ( int )( a + 0.5f );
 
-    // clamp to the limit
-    if( i < 0 )
-        i = 0;
-    else if( i > limit )
-        i = limit;
+	// clamp to the limit
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit;
 
-    // done
-    return i;
+	// done
+	return i;
 }
 
 static int FloatTo565( Vec3::Arg colour )
 {
-    // get the components in the correct range
-    int r = FloatToInt( 31.0f*colour.X(), 31 );
-    int g = FloatToInt( 63.0f*colour.Y(), 63 );
-    int b = FloatToInt( 31.0f*colour.Z(), 31 );
+	// get the components in the correct range
+	int r = FloatToInt( 31.0f*colour.X(), 31 );
+	int g = FloatToInt( 63.0f*colour.Y(), 63 );
+	int b = FloatToInt( 31.0f*colour.Z(), 31 );
 
-    // pack into a single value
-    return ( r << 11 ) | ( g << 5 ) | b;
+	// pack into a single value
+	return ( r << 11 ) | ( g << 5 ) | b;
 }
 
 static void WriteColourBlock( int a, int b, u8* indices, void* block )
 {
-    // get the block as bytes
-    u8* bytes = ( u8* )block;
+	// get the block as bytes
+	u8* bytes = ( u8* )block;
 
-    // write the endpoints - GCN 16-bit words byte-swapped
-    bytes[1] = ( u8 )( a & 0xff );
-    bytes[0] = ( u8 )( a >> 8 );
-    bytes[3] = ( u8 )( b & 0xff );
-    bytes[2] = ( u8 )( b >> 8 );
+	// write the endpoints - GCN: 16-bit words byte-swapped
+	bytes[1] = ( u8 )( a & 0xff );
+	bytes[0] = ( u8 )( a >> 8 );
+	bytes[3] = ( u8 )( b & 0xff );
+	bytes[2] = ( u8 )( b >> 8 );
 
-    // write the indices
-    for( int i = 0; i < 4; ++i )
-    {
-        u8 const* ind = indices + 4*i;
-        // GCN: indices reversed
-        bytes[4 + i] = ind[3] | ( ind[2] << 2 ) | ( ind[1] << 4 ) | ( ind[0] << 6 );
-    }
+	// write the indices
+	for( int i = 0; i < 4; ++i )
+	{
+		u8 const* ind = indices + 4*i;
+		// GCN: indices reversed
+		bytes[4 + i] = ind[3] | ( ind[2] << 2 ) | ( ind[1] << 4 ) | ( ind[0] << 6 );
+	}
 }
 
 void WriteColourBlock3GCN( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
 {
-    // get the packed values
-    int a = FloatTo565( start );
-    int b = FloatTo565( end );
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
 
-    // remap the indices
-    u8 remapped[16];
-    if( a <= b )
-    {
-        // use the indices directly
-        for( int i = 0; i < 16; ++i )
-            remapped[i] = indices[i];
-    }
-    else
-    {
-        // swap a and b
-        std::swap( a, b );
-        for( int i = 0; i < 16; ++i )
-        {
-            if( indices[i] == 0 )
-                remapped[i] = 1;
-            else if( indices[i] == 1 )
-                remapped[i] = 0;
-            else
-                remapped[i] = indices[i];
-        }
-    }
+	// remap the indices
+	u8 remapped[16];
+	if( a <= b )
+	{
+		// use the indices directly
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
+	else
+	{
+		// swap a and b
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+		{
+			if( indices[i] == 0 )
+				remapped[i] = 1;
+			else if( indices[i] == 1 )
+				remapped[i] = 0;
+			else
+				remapped[i] = indices[i];
+		}
+	}
 
-    // write the block
-    WriteColourBlock( a, b, remapped, block );
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
 }
 
 void WriteColourBlock4GCN( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
 {
-    // get the packed values
-    int a = FloatTo565( start );
-    int b = FloatTo565( end );
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
 
-    // remap the indices
-    u8 remapped[16];
-    if( a < b )
-    {
-        // swap a and b
-        std::swap( a, b );
-        for( int i = 0; i < 16; ++i )
-            remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
-    }
-    else if( a == b )
-    {
-        // use index 0
-        for( int i = 0; i < 16; ++i )
-            remapped[i] = 0;
-    }
-    else
-    {
-        // use the indices directly
-        for( int i = 0; i < 16; ++i )
-            remapped[i] = indices[i];
-    }
+	// remap the indices
+	u8 remapped[16];
+	if( a < b )
+	{
+		// swap a and b
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
+	}
+	else if( a == b )
+	{
+		// use index 0
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = 0;
+	}
+	else
+	{
+		// use the indices directly
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
 
-    // write the block
-    WriteColourBlock( a, b, remapped, block );
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
 }
 
 static int Unpack565( u8 const* packed, u8* colour )
 {
-    // build the packed value - GCN: indices reversed
-    int value = ( int )packed[1] | ( ( int )packed[0] << 8 );
+	// build the packed value - GCN: 16-bit word is stored byte-swapped
+	int value = ( int )packed[1] | ( ( int )packed[0] << 8 );
 
-    // get the components in the stored range
-    u8 red = ( u8 )( ( value >> 11 ) & 0x1f );
-    u8 green = ( u8 )( ( value >> 5 ) & 0x3f );
-    u8 blue = ( u8 )( value & 0x1f );
+	// get the components in the stored range
+	u8 red = ( u8 )( ( value >> 11 ) & 0x1f );
+	u8 green = ( u8 )( ( value >> 5 ) & 0x3f );
+	u8 blue = ( u8 )( value & 0x1f );
 
-    // scale up to 8 bits
-    colour[0] = ( red << 3 ) | ( red >> 2 );
-    colour[1] = ( green << 2 ) | ( green >> 4 );
-    colour[2] = ( blue << 3 ) | ( blue >> 2 );
-    colour[3] = 255;
+	// scale up to 8 bits
+	colour[0] = ( red << 3 ) | ( red >> 2 );
+	colour[1] = ( green << 2 ) | ( green >> 4 );
+	colour[2] = ( blue << 3 ) | ( blue >> 2 );
+	colour[3] = 255;
 
-    // return the value
-    return value;
+	// return the value
+	return value;
 }
 
 void DecompressColourGCN( u8* rgba, void const* block )
 {
-    // get the block bytes
-    u8 const* bytes = reinterpret_cast< u8 const* >( block );
+	// get the block bytes
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
 
-    // unpack the endpoints
-    u8 codes[16];
-    int a = Unpack565( bytes, codes );
-    int b = Unpack565( bytes + 2, codes + 4 );
+	// unpack the endpoints
+	u8 codes[16];
+	int a = Unpack565( bytes, codes );
+	int b = Unpack565( bytes + 2, codes + 4 );
 
-    // generate the midpoints
-    for( int i = 0; i < 3; ++i )
-    {
-        int c = codes[i];
-        int d = codes[4 + i];
+	// generate the midpoints
+	for( int i = 0; i < 3; ++i )
+	{
+		int c = codes[i];
+		int d = codes[4 + i];
 
-        if( a <= b )
-        {
-            codes[8 + i] = ( u8 )( ( c + d )/2 );
-            codes[12 + i] = 0;
-        }
-        else
-        {
-            codes[8 + i] = ( u8 )( ( 2*c + d )/3 );
-            codes[12 + i] = ( u8 )( ( c + 2*d )/3 );
-        }
-    }
+		if( a <= b )
+		{
+			codes[8 + i] = ( u8 )( ( c + d )/2 );
+			codes[12 + i] = 0;
+		}
+		else
+		{
+			codes[8 + i] = ( u8 )( ( 2*c + d )/3 );
+			codes[12 + i] = ( u8 )( ( c + 2*d )/3 );
+		}
+	}
 
-    // fill in alpha for the intermediate values
-    codes[8 + 3] = 255;
-    codes[12 + 3] = ( a <= b ) ? 0 : 255;
+	// fill in alpha for the intermediate values
+	codes[8 + 3] = 255;
+	codes[12 + 3] = ( a <= b ) ? 0 : 255;
 
-    // unpack the indices
-    u8 indices[16];
-    for( int i = 0; i < 4; ++i )
-    {
-        u8* ind = indices + 4*i;
-        u8 packed = bytes[4 + i];
+	// unpack the indices
+	u8 indices[16];
+	for( int i = 0; i < 4; ++i )
+	{
+		u8* ind = indices + 4*i;
+		u8 packed = bytes[4 + i];
 
-        // GCN: indices reversed
-        ind[3] = packed & 0x3;
-        ind[2] = ( packed >> 2 ) & 0x3;
-        ind[1] = ( packed >> 4 ) & 0x3;
-        ind[0] = ( packed >> 6 ) & 0x3;
-    }
+		// GCN: indices reversed
+		ind[3] = packed & 0x3;
+		ind[2] = ( packed >> 2 ) & 0x3;
+		ind[1] = ( packed >> 4 ) & 0x3;
+		ind[0] = ( packed >> 6 ) & 0x3;
+	}
 
-    // store out the colours
-    for( int i = 0; i < 16; ++i )
-    {
-        u8 offset = 4*indices[i];
-        for( int j = 0; j < 4; ++j )
-            rgba[4*i + j] = codes[offset + j];
-    }
+	// store out the colours
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 offset = 4*indices[i];
+		for( int j = 0; j < 4; ++j )
+			rgba[4*i + j] = codes[offset + j];
+	}
 }
 
 } // namespace squish
diff --git a/colourfit.cpp b/colourfit.cpp
index 11efa46..4bf112b 100644
--- a/colourfit.cpp
+++ b/colourfit.cpp
@@ -40,7 +40,7 @@ ColourFit::~ColourFit()
 
 void ColourFit::Compress( void* block )
 {
-	bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
+	bool isDxt1 = ( ( m_flags & ( kDxt1 | kDxt1GCN ) ) != 0 );
 	if( isDxt1 )
 	{
 		Compress3( block );
diff --git a/colourset.cpp b/colourset.cpp
index 97d29d9..b086c9a 100644
--- a/colourset.cpp
+++ b/colourset.cpp
@@ -32,7 +32,7 @@ ColourSet::ColourSet( u8 const* rgba, int mask, int flags )
 	m_transparent( false )
 {
 	// check the compression mode for dxt1
-	bool isDxt1 = ( ( flags & kDxt1 ) != 0 );
+	bool isDxt1 = ( ( flags & ( kDxt1 | kDxt1GCN ) ) != 0 );
 	bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 );
 
 	// create the minimal set
diff --git a/rangefit.cpp b/rangefit.cpp
index 3fca124..ad1b15d 100644
--- a/rangefit.cpp
+++ b/rangefit.cpp
@@ -26,6 +26,7 @@
 #include "rangefit.h"
 #include "colourset.h"
 #include "colourblock.h"
+#include "colourblockGCN.h"
 #include <cfloat>
 
 namespace squish {
@@ -138,8 +139,11 @@ void RangeFit::Compress3( void* block )
 		m_colours->RemapIndices( closest, indices );
 		
 		// save the block
-		WriteColourBlock3( m_start, m_end, indices, block );
-		
+		if ( ( m_flags & kDxt1GCN ) != 0 )
+			WriteColourBlock3GCN( m_start, m_end, indices, block );
+		else
+			WriteColourBlock3( m_start, m_end, indices, block );
+
 		// save the error
 		m_besterror = error;
 	}
@@ -191,7 +195,10 @@ void RangeFit::Compress4( void* block )
 		m_colours->RemapIndices( closest, indices );
 		
 		// save the block
-		WriteColourBlock4( m_start, m_end, indices, block );
+		if ( ( m_flags & kDxt1GCN ) != 0 )
+			WriteColourBlock4GCN( m_start, m_end, indices, block );
+		else
+			WriteColourBlock4( m_start, m_end, indices, block );
 
 		// save the error
 		m_besterror = error;
diff --git a/singlecolourfit.cpp b/singlecolourfit.cpp
index e8a0117..eff31e3 100644
--- a/singlecolourfit.cpp
+++ b/singlecolourfit.cpp
@@ -26,6 +26,7 @@
 #include "singlecolourfit.h"
 #include "colourset.h"
 #include "colourblock.h"
+#include "colourblockGCN.h"
 
 namespace squish {
 
@@ -92,7 +93,10 @@ void SingleColourFit::Compress3( void* block )
 		m_colours->RemapIndices( &m_index, indices );
 		
 		// save the block
-		WriteColourBlock3( m_start, m_end, indices, block );
+		if ( ( m_flags & kDxt1GCN ) != 0 )
+			WriteColourBlock3GCN( m_start, m_end, indices, block );
+		else
+			WriteColourBlock3( m_start, m_end, indices, block );
 
 		// save the error
 		m_besterror = m_error;
@@ -120,7 +124,10 @@ void SingleColourFit::Compress4( void* block )
 		m_colours->RemapIndices( &m_index, indices );
 		
 		// save the block
-		WriteColourBlock4( m_start, m_end, indices, block );
+		if ( ( m_flags & kDxt1GCN ) != 0 )
+			WriteColourBlock4GCN( m_start, m_end, indices, block );
+		else
+			WriteColourBlock4( m_start, m_end, indices, block );
 
 		// save the error
 		m_besterror = m_error;
diff --git a/squish.cpp b/squish.cpp
index 2af83f3..46f6dac 100644
--- a/squish.cpp
+++ b/squish.cpp
@@ -29,6 +29,7 @@
 #include "rangefit.h"
 #include "clusterfit.h"
 #include "colourblock.h"
+#include "colourblockGCN.h"
 #include "alpha.h"
 #include "singlecolourfit.h"
 
@@ -37,12 +38,12 @@ namespace squish {
 static int FixFlags( int flags )
 {
 	// grab the flag bits
-	int method = flags & ( kDxt1 | kDxt3 | kDxt5 );
+	int method = flags & ( kDxt1 | kDxt3 | kDxt5 | kDxt1GCN );
 	int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit );
 	int extra = flags & kWeightColourByAlpha;
 	
 	// set defaults
-	if( method != kDxt3 && method != kDxt5 )
+	if( method != kDxt3 && method != kDxt5 && method != kDxt1GCN )
 		method = kDxt1;
 	if( fit != kColourRangeFit && fit != kColourIterativeClusterFit )
 		fit = kColourClusterFit;
@@ -104,7 +105,10 @@ void Decompress( u8* rgba, void const* block, int flags )
 		colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
 
 	// decompress colour
-	DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+	if ( ( flags & kDxt1GCN ) != 0 )
+		DecompressColourGCN( rgba, colourBlock );
+	else
+		DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
 
 	// decompress alpha separately if necessary
 	if( ( flags & kDxt3 ) != 0 )
@@ -120,7 +124,7 @@ int GetStorageRequirements( int width, int height, int flags )
 	
 	// compute the storage requirements
 	int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 );
-	int blocksize = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int blocksize = ( ( flags & ( kDxt1 | kDxt1GCN ) ) != 0 ) ? 8 : 16;
 	return blockcount*blocksize;	
 }
 
@@ -131,7 +135,7 @@ void CompressImage( u8 const* rgba, int width, int height, void* blocks, int fla
 
 	// initialise the block output
 	u8* targetBlock = reinterpret_cast< u8* >( blocks );
-	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int bytesPerBlock = ( ( flags & ( kDxt1 | kDxt1GCN ) ) != 0 ) ? 8 : 16;
 
 	// loop over blocks
 	for( int y = 0; y < height; y += 4 )
@@ -185,7 +189,7 @@ void DecompressImage( u8* rgba, int width, int height, void const* blocks, int f
 
 	// initialise the block input
 	u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks );
-	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int bytesPerBlock = ( ( flags & ( kDxt1 | kDxt1GCN ) ) != 0 ) ? 8 : 16;
 
 	// loop over blocks
 	for( int y = 0; y < height; y += 4 )
diff --git a/squish.h b/squish.h
index 212b54c..8bbc976 100644
--- a/squish.h
+++ b/squish.h
@@ -46,6 +46,9 @@ enum
 	
 	//! Use DXT5 compression.
 	kDxt5 = ( 1 << 2 ), 
+
+	//! Use DXT1 compression with GCN packing (byte-swapped endpoints, reversed index order).
+	kDxt1GCN = ( 1 << 9 ),
 	
 	//! Use a very slow but very high quality colour compressor.
 	kColourIterativeClusterFit = ( 1 << 8 ),