Updated SDL's YUV support, many thanks to Adrien Descamps

New functions get and set the YUV colorspace conversion mode: SDL_SetYUVConversionMode(), SDL_GetYUVConversionMode(), and SDL_GetYUVConversionModeForResolution(). SDL_ConvertPixels() converts between all supported RGB and YUV formats, with SSE acceleration for converting from planar YUV formats (YV12, NV12, etc.) to common RGB/RGBA formats. Added a new test program, testyuv, to verify the correctness and speed of the YUV conversion functionality.
2025-12-18 09:25:29 +00:00 · 2017-11-12 22:51:12 -08:00
parent c317ab978f
commit a6a4e27ae8
60 changed files with 8368 additions and 4310 deletions
--- a/src/video/SDL_pixels.c
+++ b/src/video/SDL_pixels.c
@@ -748,30 +748,6 @@ SDL_DitherColors(SDL_Color * colors, int bpp)
    }
 }

-/*
- * Calculate the pad-aligned scanline width of a surface
- */
-int
-SDL_CalculatePitch(SDL_Surface * surface)
-{
-    int pitch;
-
-    /* Surface should be 4-byte aligned for speed */
-    pitch = surface->w * surface->format->BytesPerPixel;
-    switch (surface->format->BitsPerPixel) {
-    case 1:
-        pitch = (pitch + 7) / 8;
-        break;
-    case 4:
-        pitch = (pitch + 1) / 2;
-        break;
-    default:
-        break;
-    }
-    pitch = (pitch + 3) & ~3;   /* 4-byte aligning */
-    return (pitch);
-}
-
 /*
 * Match an RGB value to a particular palette index
 */
--- a/src/video/SDL_pixels_c.h
+++ b/src/video/SDL_pixels_c.h
@@ -34,7 +34,6 @@ extern int SDL_MapSurface(SDL_Surface * src, SDL_Surface * dst);
 extern void SDL_FreeBlitMap(SDL_BlitMap * map);

 /* Miscellaneous functions */
-extern int SDL_CalculatePitch(SDL_Surface * surface);
 extern void SDL_DitherColors(SDL_Color * colors, int bpp);
 extern Uint8 SDL_FindColor(SDL_Palette * pal, Uint8 r, Uint8 g, Uint8 b, Uint8 a);

--- a/src/video/SDL_surface.c
+++ b/src/video/SDL_surface.c
@@ -25,17 +25,8 @@
 #include "SDL_blit.h"
 #include "SDL_RLEaccel_c.h"
 #include "SDL_pixels_c.h"
+#include "SDL_yuv_c.h"

-/* Private routines */
-static int
-SDL_ConvertPixels_YUV_to_ARGB8888(int width, int height,
-        Uint32 src_format, const void *src, 
-        void *dst, int dst_pitch);
-
-static int 
-SDL_ConvertPixels_ARGB8888_to_YUV(int width, int height, 
-        const void *src, int src_pitch,
-        Uint32 dst_format, void *dst);

 /* Check to make sure we can safely check multiplication of surface w and pitch and it won't overflow size_t */
 SDL_COMPILE_TIME_ASSERT(surface_size_assumptions,
@@ -43,6 +34,30 @@ SDL_COMPILE_TIME_ASSERT(surface_size_assumptions,

 /* Public routines */

+/*
+ * Calculate the pad-aligned scanline width (pitch), in bytes, of a row of
+ * `width` pixels in the given SDL_PIXELFORMAT_* format.
+ *
+ * Sub-byte formats (1 and 4 bits per pixel) report SDL_BYTESPERPIXEL() == 0,
+ * so their row size is derived from the bit depth, rounded up to whole
+ * bytes. The result is rounded up to a multiple of 4 bytes.
+ */
+int
+SDL_CalculatePitch(Uint32 format, int width)
+{
+    int pitch;
+
+    if (SDL_ISPIXELFORMAT_FOURCC(format) || SDL_BITSPERPIXEL(format) >= 8) {
+        pitch = width * SDL_BYTESPERPIXEL(format);
+    } else {
+        /* Packed sub-byte pixels: round the bit count up to whole bytes */
+        pitch = (width * SDL_BITSPERPIXEL(format) + 7) / 8;
+    }
+    /* Surface should be 4-byte aligned for speed */
+    pitch = (pitch + 3) & ~3;
+    return pitch;
+}
+
 /*
 * Create an empty RGB surface of the appropriate depth using the given
 * enum SDL_PIXELFORMAT_* format
@@ -70,7 +85,7 @@ SDL_CreateRGBSurfaceWithFormat(Uint32 flags, int width, int height, int depth,
    }
    surface->w = width;
    surface->h = height;
-    surface->pitch = SDL_CalculatePitch(surface);
+    surface->pitch = SDL_CalculatePitch(format, width);
    SDL_SetClipRect(surface, NULL);

    if (SDL_ISPIXELFORMAT_INDEXED(surface->format->format)) {
@@ -1138,135 +1153,27 @@ int SDL_ConvertPixels(int width, int height,
        return SDL_InvalidParamError("dst_pitch");
    }

+    if (SDL_ISPIXELFORMAT_FOURCC(src_format) && SDL_ISPIXELFORMAT_FOURCC(dst_format)) {
+        return SDL_ConvertPixels_YUV_to_YUV(width, height, src_format, src, src_pitch, dst_format, dst, dst_pitch);
+    } else if (SDL_ISPIXELFORMAT_FOURCC(src_format)) {
+        return SDL_ConvertPixels_YUV_to_RGB(width, height, src_format, src, src_pitch, dst_format, dst, dst_pitch);
+    } else if (SDL_ISPIXELFORMAT_FOURCC(dst_format)) {
+        return SDL_ConvertPixels_RGB_to_YUV(width, height, src_format, src, src_pitch, dst_format, dst, dst_pitch);
+    }
+
    /* Fast path for same format copy */
    if (src_format == dst_format) {
        int i;
-
-        if (SDL_ISPIXELFORMAT_FOURCC(src_format)) {
-            switch (src_format) {
-            case SDL_PIXELFORMAT_YUY2:
-            case SDL_PIXELFORMAT_UYVY:
-            case SDL_PIXELFORMAT_YVYU:
-                /* Packed planes */
-                width = 4 * ((width + 1) / 2);
-                for (i = height; i--;) {
-                    SDL_memcpy(dst, src, width);
-                    src = (const Uint8*)src + src_pitch;
-                    dst = (Uint8*)dst + dst_pitch;
-                }
-                break;
-            case SDL_PIXELFORMAT_YV12:
-            case SDL_PIXELFORMAT_IYUV:
-            case SDL_PIXELFORMAT_NV12:
-            case SDL_PIXELFORMAT_NV21:
-                {
-                    /* Y plane */
-                    for (i = height; i--;) {
-                        SDL_memcpy(dst, src, width);
-                        src = (const Uint8*)src + src_pitch;
-                        dst = (Uint8*)dst + dst_pitch;
-                    }
-
-                    /* not sure the pitch is relevant here.
-                       this also works to add the size of two chroma planes */
-#if 0
-                    SDL_memcpy(dst, src, 2 * ((width + 1)/2) * ((height+1)/2));
-#else
-
-                    if (src_format == SDL_PIXELFORMAT_YV12 || src_format == SDL_PIXELFORMAT_IYUV) {
-                        /* U and V planes are a quarter the size of the Y plane */
-                        width = (width + 1) / 2;
-                        height = (height + 1) / 2;
-                        src_pitch = (src_pitch + 1) / 2;
-                        dst_pitch = (dst_pitch + 1) / 2;
-                        for (i = height * 2; i--;) {
-                            SDL_memcpy(dst, src, width);
-                            src = (const Uint8*)src + src_pitch;
-                            dst = (Uint8*)dst + dst_pitch;
-                        }
-                    } else if (src_format == SDL_PIXELFORMAT_NV12 || src_format == SDL_PIXELFORMAT_NV21) {
-                        /* U/V plane is half the height of the Y plane */
-                        height = (height + 1) / 2;
-                        width = (width + 1) / 2;
-                        src_pitch = (src_pitch + 1) / 2;
-                        dst_pitch = (dst_pitch + 1) / 2;
-                        for (i = height; i--;) {
-                            SDL_memcpy(dst, src, 2 * width);
-                            src = (const Uint8*)src + 2 * src_pitch;
-                            dst = (Uint8*)dst + 2 * dst_pitch;
-                        }
-                    }
-#endif
-                }
-                break;
-            default:
-                return SDL_SetError("Unknown FOURCC pixel format");
-            }
-        } else {
-            const int bpp = SDL_BYTESPERPIXEL(src_format);
-            width *= bpp;
-            for (i = height; i--;) {
-                SDL_memcpy(dst, src, width);
-                src = (const Uint8*)src + src_pitch;
-                dst = (Uint8*)dst + dst_pitch;
-            }
+        const int bpp = SDL_BYTESPERPIXEL(src_format);
+        width *= bpp;
+        for (i = height; i--;) {
+            SDL_memcpy(dst, src, width);
+            src = (const Uint8*)src + src_pitch;
+            dst = (Uint8*)dst + dst_pitch;
        }
        return 0;
    }

-    /* FOURCC to Any */
-    if (SDL_ISPIXELFORMAT_FOURCC(src_format)) {
-        /* FOURCC to ARGB8888 */
-        if (dst_format == SDL_PIXELFORMAT_ARGB8888) {
-            SDL_ConvertPixels_YUV_to_ARGB8888(width, height, src_format, src, dst, dst_pitch);
-            return 0;
-        }
-        else /* FOURCC to not(ARGB8888) : need an intermediate conversion */
-        {
-            int ret;
-            void *tmp = SDL_malloc(width * height * 4);
-            if (tmp == NULL) {
-                return -1;
-            }
-
-            /* convert src/FOURCC to tmp/ARGB8888 */
-            SDL_ConvertPixels_YUV_to_ARGB8888(width, height, src_format, src, tmp, width * 4);
-            
-            /* convert tmp/ARGB8888 to dst/dst_format */
-            ret = SDL_ConvertPixels(width, height, SDL_PIXELFORMAT_ARGB8888, tmp, width * 4, dst_format, dst, dst_pitch);
-            SDL_free(tmp);
-            return ret;
-        }
-    }
-
-    /* Any to FOURCC */
-    if (SDL_ISPIXELFORMAT_FOURCC(dst_format)) {
-        /* ARGB8888 to FOURCC */
-        if (src_format == SDL_PIXELFORMAT_ARGB8888) {
-            SDL_ConvertPixels_ARGB8888_to_YUV(width, height, src, src_pitch, dst_format, dst);
-            return 0;
-        }
-        else /* not(ARGB8888) to FOURCC : need an intermediate conversion */
-        {
-            int ret;
-            void *tmp = SDL_malloc(width * height * 4);
-            if (tmp == NULL) {
-                return -1;
-            }
-            /* convert src/src_format to tmp/ARGB8888 */
-            ret = SDL_ConvertPixels(width, height, src_format, src, src_pitch, SDL_PIXELFORMAT_ARGB8888, tmp, width * 4);
-            if (ret == -1) {
-                SDL_free(tmp);
-                return ret;
-            }
-            /* convert tmp/ARGB8888 to dst/FOURCC */
-            SDL_ConvertPixels_ARGB8888_to_YUV(width, height, tmp, width * 4, dst_format, dst);
-
-            SDL_free(tmp);
-            return 0;
-        }
-    }
-
    if (!SDL_CreateSurfaceOnStack(width, height, src_format, nonconst_src,
                                  src_pitch,
                                  &src_surface, &src_fmt, &src_blitmap)) {
@@ -1322,491 +1229,4 @@ SDL_FreeSurface(SDL_Surface * surface)
    SDL_free(surface);
 }

-
-/* YUV-RGB conversion */
-#define CLAMP(val) ((val) > 0 ? ((val) < 255 ? (val) : 255) : 0)
-
-#if 1
-
-/* Coefficients from CCIR 601 */
-#define MAKE_Y(r, g, b) (int)( 0.29900f * (r) + 0.58700f * (g) + 0.11400f * (b))
-#define MAKE_U(r, g, b) (int)(-0.16874f * (r) - 0.33126f * (g) + 0.50000f * (b) + 128)
-#define MAKE_V(r, g, b) (int)( 0.50000f * (r) - 0.41869f * (g) - 0.08131f * (b) + 128)
-
-#define MAKE_R(y, u, v) CLAMP((int)((y)                          + 1.40200f * ((v) - 128)))
-#define MAKE_G(y, u, v) CLAMP((int)((y) - 0.34414f * ((u) - 128) - 0.71414f * ((v) - 128)))
-#define MAKE_B(y, u, v) CLAMP((int)((y) + 1.77200f * ((u) - 128)                         ))
-
-#else
-
-/* Coefficients from Video Demystified */
-#define MAKE_Y(r, g, b) (((  66 * (r) + 129 * (g) +  25 * (b) + 128) >> 8) + 16)
-#define MAKE_U(r, g, b) ((( -38 * (r) -  74 * (g) + 112 * (b) + 128) >> 8) + 128)
-#define MAKE_V(r, g, b) ((( 112 * (r) -  94 * (g) -  18 * (b) + 128) >> 8) + 128)
-
-#define MAKE_R(y, u, v) CLAMP(( 298 * ((y) - 16)                     + 409 * ((v) - 128) + 128) >> 8) 
-#define MAKE_G(y, u, v) CLAMP(( 298 * ((y) - 16) - 100 * ((u) - 128) - 208 * ((v) - 128) + 128) >> 8)
-#define MAKE_B(y, u, v) CLAMP(( 298 * ((y) - 16) + 516 * ((u) - 128)                     + 128) >> 8)
-
-#endif
-
-
-static int
-SDL_ConvertPixels_YUV_to_ARGB8888(int width, int height,
-         Uint32 src_format, const void *src, 
-         void *dst, int dst_pitch)
-{   
-    const int sz_plane         = width * height;
-    const int sz_plane_chroma  = ((width + 1) / 2) * ((height + 1) / 2);
-    const int width_remainder  = (width &  0x1);
-    const int width_half       = width / 2;
-    const int curr_row_padding = dst_pitch - 4 * width;
-    int i, j;
-    Uint8 *curr_row = (Uint8*)dst;
-
-    // SDL_Log("SDL_ConvertPixels_YUV_to_ARGB8888 (from %s)", SDL_GetPixelFormatName(src_format));
-
-#define WRITE_RGB_PIXEL(y, u, v)            \
-    *((Uint32*)curr_row) =                  \
-           (MAKE_B((y), (u), (v))           \
-         | (MAKE_G((y), (u), (v)) << 8)     \
-         | (MAKE_R((y), (u), (v)) << 16)    \
-         | 0xff000000);                     \
-    curr_row += 4;                          \
-
-    switch (src_format) 
-    {
-        case SDL_PIXELFORMAT_YV12:
-        case SDL_PIXELFORMAT_IYUV:
-        case SDL_PIXELFORMAT_NV12:
-        case SDL_PIXELFORMAT_NV21:
-            {
-                const Uint8 *plane_y = (const Uint8*)src;
-
-                if (src_format == SDL_PIXELFORMAT_YV12 || src_format == SDL_PIXELFORMAT_IYUV)
-                {
-                    const Uint8 *plane_u = (src_format == SDL_PIXELFORMAT_YV12 ? plane_y + sz_plane + sz_plane_chroma : plane_y + sz_plane);
-                    const Uint8 *plane_v = (src_format == SDL_PIXELFORMAT_YV12 ? plane_y + sz_plane : plane_y + sz_plane + sz_plane_chroma);
-
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            const Uint8 u = *plane_u++;
-                            const Uint8 v = *plane_v++;
-                            const Uint8 y = *plane_y++;
-                            const Uint8 y1 = *plane_y++;
-                            WRITE_RGB_PIXEL(y, u, v);
-                            WRITE_RGB_PIXEL(y1, u, v);
-                        }
-                        if (width_remainder) {
-                            const Uint8 u = *plane_u++;
-                            const Uint8 v = *plane_v++;
-                            const Uint8 y = *plane_y++;
-                            WRITE_RGB_PIXEL(y, u, v);
-                        }
-                        /* Re-use the same line of chroma planes */
-                        if ((j & 0x1) == 0x0) {
-                            plane_u -= width_half + width_remainder;
-                            plane_v -= width_half + width_remainder;
-                        }
-                        curr_row += curr_row_padding;
-                    }
-                }
-                else if (src_format == SDL_PIXELFORMAT_NV12)
-                {
-                    const Uint8 *plane_interleaved_uv = plane_y + sz_plane;
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            const Uint8 y = *plane_y++;
-                            const Uint8 y1 = *plane_y++;
-                            const Uint8 u = *plane_interleaved_uv++;
-                            const Uint8 v = *plane_interleaved_uv++;
-                            WRITE_RGB_PIXEL(y, u, v);
-                            WRITE_RGB_PIXEL(y1, u, v);
-                        }
-                        if (width_remainder) {
-                            const Uint8 y = *plane_y++;
-                            const Uint8 u = *plane_interleaved_uv++;
-                            const Uint8 v = *plane_interleaved_uv++;
-                            WRITE_RGB_PIXEL(y, u, v);
-                        }
-                        /* Re-use the same line of chroma planes */
-                        if ((j & 0x1) == 0x0) {
-                            plane_interleaved_uv -= 2 * (width_half + width_remainder);
-                        }
-                        curr_row += curr_row_padding;
-                    }
-                } 
-                else /* src_format == SDL_PIXELFORMAT_NV21 */
-                {
-                    const Uint8 *plane_interleaved_uv = plane_y + sz_plane;
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            const Uint8 y = *plane_y++;
-                            const Uint8 y1 = *plane_y++;
-                            const Uint8 v = *plane_interleaved_uv++;
-                            const Uint8 u = *plane_interleaved_uv++;
-                            WRITE_RGB_PIXEL(y, u, v);
-                            WRITE_RGB_PIXEL(y1, u, v);
-                        }
-                        if (width_remainder) {
-                            const Uint8 y = *plane_y++;
-                            const Uint8 v = *plane_interleaved_uv++;
-                            const Uint8 u = *plane_interleaved_uv++;
-                            WRITE_RGB_PIXEL(y, u, v);
-                        }
-                        /* Re-use the same line of chroma planes */
-                        if ((j & 0x1) == 0x0) {
-                            plane_interleaved_uv -= 2 * (width_half + width_remainder);
-                        }
-                        curr_row += curr_row_padding;
-                    }
-                }
-            }
-            break;
-
-        case SDL_PIXELFORMAT_YUY2:
-        case SDL_PIXELFORMAT_UYVY:
-        case SDL_PIXELFORMAT_YVYU:
-            {
-                const Uint8 *plane = (const Uint8 *)src;
-
-#define READ_PACKED_YUV(var1, var2, var3, var4) \
-                const Uint8 var1 = plane[0];    \
-                const Uint8 var2 = plane[1];    \
-                const Uint8 var3 = plane[2];    \
-                const Uint8 var4 = plane[3];    \
-                plane += 4;                     \
-
-                if (src_format == SDL_PIXELFORMAT_YUY2) /* Y U Y1 V */
-                {
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_PACKED_YUV(y, u, y1, v);
-                            WRITE_RGB_PIXEL(y, u, v);
-                            WRITE_RGB_PIXEL(y1, u, v);
-                        }
-                        if (width_remainder) {
-                            READ_PACKED_YUV(y, u, y1, v); 
-                            (void)y1; /* y1 unused */
-                            WRITE_RGB_PIXEL(y, u, v);
-                        }
-                        curr_row += curr_row_padding;
-                    }
-                } 
-                else if (src_format == SDL_PIXELFORMAT_UYVY) /* U Y V Y1 */
-                {
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_PACKED_YUV(u, y, v, y1);
-                            WRITE_RGB_PIXEL(y, u, v);
-                            WRITE_RGB_PIXEL(y1, u, v);
-                        }
-                        if (width_remainder) {
-                            READ_PACKED_YUV(u, y, v, y1);
-                            (void) y1; /* y1 unused */
-                            WRITE_RGB_PIXEL(y, u, v);
-                        }
-                        curr_row += curr_row_padding;
-                    }
-                }
-                else if (src_format == SDL_PIXELFORMAT_YVYU) /* Y V Y1 U */
-                {
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_PACKED_YUV(y, v, y1, u);
-                            WRITE_RGB_PIXEL(y, u, v);
-                            WRITE_RGB_PIXEL(y1, u, v);
-                        }
-                        if (width_remainder) {
-                            READ_PACKED_YUV(y, v, y1, u);
-                            (void) y1; /* y1 unused */
-                            WRITE_RGB_PIXEL(y, u, v);
-                        }
-                        curr_row += curr_row_padding;
-                    }
-                } 
-#undef READ_PACKED_YUV
-            }
-            break;
-    }
-#undef WRITE_RGB_PIXEL
-    return 0;
-}
-
-static int
-SDL_ConvertPixels_ARGB8888_to_YUV(int width, int height, const void *src, int src_pitch, Uint32 dst_format, void *dst)
-{
-    const int src_pitch_x_2    = src_pitch * 2;
-    const int sz_plane         = width * height;
-    const int sz_plane_chroma  = ((width + 1) / 2) * ((height + 1) / 2);
-    const int height_half      = height / 2;
-    const int height_remainder = (height &  0x1);
-    const int width_half       = width / 2;
-    const int width_remainder  = (width  &  0x1);
-    int i, j;
-    
-    // SDL_Log("SDL_ConvertPixels_ARGB8888_to_YUV (to %s)", SDL_GetPixelFormatName(dst_format));
-
-    switch (dst_format) 
-    {
-        case SDL_PIXELFORMAT_YV12:
-        case SDL_PIXELFORMAT_IYUV:
-        case SDL_PIXELFORMAT_NV12:
-        case SDL_PIXELFORMAT_NV21:
-            {
-                const Uint8 *curr_row, *next_row;
-                
-                Uint8 *plane_y = (Uint8*) dst;
-                Uint8 *plane_u = (dst_format == SDL_PIXELFORMAT_YV12 ? plane_y + sz_plane + sz_plane_chroma : plane_y + sz_plane);
-                Uint8 *plane_v = (dst_format == SDL_PIXELFORMAT_YV12 ? plane_y + sz_plane : plane_y + sz_plane + sz_plane_chroma);
-                Uint8 *plane_interleaved_uv = plane_y + sz_plane;
-
-                curr_row = (const Uint8*)src;
-
-                /* Write Y plane */
-                for (j = 0; j < height; j++) {
-                    for (i = 0; i < width; i++) {
-                        const Uint8 b = curr_row[4 * i + 0];
-                        const Uint8 g = curr_row[4 * i + 1];
-                        const Uint8 r = curr_row[4 * i + 2];
-                        *plane_y++ = MAKE_Y(r, g, b);
-                    }
-                    curr_row += src_pitch;
-                }
-
-                curr_row = (const Uint8*)src;
-                next_row = (const Uint8*)src;
-                next_row += src_pitch;
-
-#if 1
-/* slightly faster */
-#define READ_2x2_PIXELS                                                                                                 \
-                const Uint32 p1 = ((const Uint32 *)curr_row)[2 * i];                                                    \
-                const Uint32 p2 = ((const Uint32 *)curr_row)[2 * i + 1];                                                \
-                const Uint32 p3 = ((const Uint32 *)next_row)[2 * i];                                                    \
-                const Uint32 p4 = ((const Uint32 *)next_row)[2 * i + 1];                                                \
-                const Uint32 b = ((p1 & 0x000000ff) + (p2 & 0x000000ff) + (p3 & 0x000000ff) + (p4 & 0x000000ff)) >> 2;  \
-                const Uint32 g = ((p1 & 0x0000ff00) + (p2 & 0x0000ff00) + (p3 & 0x0000ff00) + (p4 & 0x0000ff00)) >> 10; \
-                const Uint32 r = ((p1 & 0x00ff0000) + (p2 & 0x00ff0000) + (p3 & 0x00ff0000) + (p4 & 0x00ff0000)) >> 18; \
-
-#else
-
-#define READ_2x2_PIXELS                                                             \
-                const Uint8 b = (curr_row[8 * i + 0] + curr_row[8 * i + 4]          \
-                               + next_row[8 * i + 0] + next_row[8 * i + 4] ) >> 2;  \
-                const Uint8 g = (curr_row[8 * i + 1] + curr_row[8 * i + 5]          \
-                               + next_row[8 * i + 1] + next_row[8 * i + 5] ) >> 2;  \
-                const Uint8 r = (curr_row[8 * i + 2] + curr_row[8 * i + 6]          \
-                               + next_row[8 * i + 2] + next_row[8 * i + 6] ) >> 2;  \
-
-#endif
-
-#define READ_2x1_PIXELS                                                             \
-                const Uint8 b = (curr_row[8 * i + 0] + next_row[8 * i + 0]) >> 1;   \
-                const Uint8 g = (curr_row[8 * i + 1] + next_row[8 * i + 1]) >> 1;   \
-                const Uint8 r = (curr_row[8 * i + 2] + next_row[8 * i + 2]) >> 1;   \
-
-#define READ_1x2_PIXELS                                                             \
-                const Uint8 b = (curr_row[8 * i + 0] + curr_row[8 * i + 4]) >> 1;   \
-                const Uint8 g = (curr_row[8 * i + 1] + curr_row[8 * i + 5]) >> 1;   \
-                const Uint8 r = (curr_row[8 * i + 2] + curr_row[8 * i + 6]) >> 1;   \
-
-#define READ_1x1_PIXEL                                                              \
-                const Uint8 b = curr_row[8 * i + 0];                                \
-                const Uint8 g = curr_row[8 * i + 1];                                \
-                const Uint8 r = curr_row[8 * i + 2];                                \
-                
-                if (dst_format == SDL_PIXELFORMAT_YV12 || dst_format == SDL_PIXELFORMAT_IYUV)
-                {
-                    /* Write UV planes, not interleaved */
-                    for (j = 0; j < height_half; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_2x2_PIXELS;
-                            *plane_u++ = MAKE_U(r, g, b);
-                            *plane_v++ = MAKE_V(r, g, b);
-                        }
-                        if (width_remainder) {
-                            READ_2x1_PIXELS;
-                            *plane_u++ = MAKE_U(r, g, b);
-                            *plane_v++ = MAKE_V(r, g, b);
-                        }
-                        curr_row += src_pitch_x_2;
-                        next_row += src_pitch_x_2;
-                    }
-                    if (height_remainder) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_1x2_PIXELS;
-                            *plane_u++ = MAKE_U(r, g, b);
-                            *plane_v++ = MAKE_V(r, g, b);
-                        }
-                        if (width_remainder) {
-                            READ_1x1_PIXEL;
-                            *plane_u++ = MAKE_U(r, g, b);
-                            *plane_v++ = MAKE_V(r, g, b);
-                        }
-                    }
-                }
-                else if (dst_format == SDL_PIXELFORMAT_NV12)
-                {
-                    for (j = 0; j < height_half; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_2x2_PIXELS;
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                        }
-                        if (width_remainder) {
-                            READ_2x1_PIXELS;
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                        }
-                        curr_row += src_pitch_x_2;
-                        next_row += src_pitch_x_2;
-                    }
-                    if (height_remainder) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_1x2_PIXELS;
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                        }
-                        if (width_remainder) {
-                            READ_1x1_PIXEL;
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                        }
-                    }
-                } 
-                else /* dst_format == SDL_PIXELFORMAT_NV21 */
-                {
-                    for (j = 0; j < height_half; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_2x2_PIXELS;
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                        }
-                        if (width_remainder) {
-                            READ_2x1_PIXELS;
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                        }
-                        curr_row += src_pitch_x_2;
-                        next_row += src_pitch_x_2;
-                    }
-                    if (height_remainder) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_1x2_PIXELS;
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                        }
-                        if (width_remainder) {
-                            READ_1x1_PIXEL;
-                            *plane_interleaved_uv++ = MAKE_V(r, g, b);
-                            *plane_interleaved_uv++ = MAKE_U(r, g, b);
-                        }
-                    }
-                }
-#undef READ_2x2_PIXELS
-#undef READ_2x1_PIXELS
-#undef READ_1x2_PIXELS
-#undef READ_1x1_PIXEL
-            }
-            break;
-
-        case SDL_PIXELFORMAT_YUY2:
-        case SDL_PIXELFORMAT_UYVY:
-        case SDL_PIXELFORMAT_YVYU:
-            {
-                const Uint8 *curr_row = (const Uint8*) src;
-                Uint8 *plane           = (Uint8*) dst;
-
-#define READ_TWO_RGB_PIXELS \
-                const Uint8 b = curr_row[8 * i + 0];    \
-                const Uint8 g = curr_row[8 * i + 1];    \
-                const Uint8 r = curr_row[8 * i + 2];    \
-                const Uint8 b1 = curr_row[8 * i + 4];   \
-                const Uint8 g1 = curr_row[8 * i + 5];   \
-                const Uint8 r1 = curr_row[8 * i + 6];   \
-                const Uint8 B = (b + b1) >> 1;          \
-                const Uint8 G = (g + g1) >> 1;          \
-                const Uint8 R = (r + r1) >> 1;          \
-
-#define READ_ONE_RGB_PIXEL \
-                const Uint8 b = curr_row[8 * i + 0];    \
-                const Uint8 g = curr_row[8 * i + 1];    \
-                const Uint8 r = curr_row[8 * i + 2];    \
-
-                /* Write YUV plane, packed */
-                if (dst_format == SDL_PIXELFORMAT_YUY2) 
-                {
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_TWO_RGB_PIXELS;
-                            /* Y U Y1 V */
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_U(R, G, B);
-                            *plane++ = MAKE_Y(r1, g1, b1);
-                            *plane++ = MAKE_V(R, G, B);
-                        }
-                        if (width_remainder) {
-                            READ_ONE_RGB_PIXEL;
-                            /* Y U Y V */
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_U(r, g, b);
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_V(r, g, b);
-                        }
-                        curr_row += src_pitch;
-                    }
-                } 
-                else if (dst_format == SDL_PIXELFORMAT_UYVY)
-                {
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_TWO_RGB_PIXELS;
-                            /* U Y V Y1 */
-                            *plane++ = MAKE_U(R, G, B);
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_V(R, G, B);
-                            *plane++ = MAKE_Y(r1, g1, b1);
-                        }
-                        if (width_remainder) {
-                            READ_ONE_RGB_PIXEL;
-                            /* U Y V Y */
-                            *plane++ = MAKE_U(r, g, b);
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_V(r, g, b);
-                            *plane++ = MAKE_Y(r, g, b);
-                        }
-                        curr_row += src_pitch;
-                    }
-                }
-                else if (dst_format == SDL_PIXELFORMAT_YVYU)
-                {
-                    for (j = 0; j < height; j++) {
-                        for (i = 0; i < width_half; i++) {
-                            READ_TWO_RGB_PIXELS;
-                            /* Y V Y1 U */
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_V(R, G, B);
-                            *plane++ = MAKE_Y(r1, g1, b1);
-                            *plane++ = MAKE_U(R, G, B);
-                        }
-                        if (width_remainder) {
-                            READ_ONE_RGB_PIXEL;
-                            /* Y V Y U */
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_V(r, g, b);
-                            *plane++ = MAKE_Y(r, g, b);
-                            *plane++ = MAKE_U(r, g, b);
-                        }
-                        curr_row += src_pitch;
-                    }
-                }
-#undef READ_TWO_RGB_PIXELS
-#undef READ_ONE_RGB_PIXEL
-            }
-            break;
-    }
-    return 0;
-}
-
 /* vi: set ts=4 sw=4 expandtab: */
--- a/src/video/SDL_yuv.c
+++ b/src/video/SDL_yuv.c
--- a/src/video/SDL_yuv_c.h
+++ b/src/video/SDL_yuv_c.h
@@ -0,0 +1,30 @@
+/*
+  Simple DirectMedia Layer
+  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+#include "../SDL_internal.h"
+
+#ifndef SDL_yuv_c_h_
+#define SDL_yuv_c_h_
+
+/*
+ * YUV conversion functions
+ *
+ * All three share one signature: image size (width, height), then the source
+ * format / pixel pointer / pitch, then the destination format / pixel
+ * pointer / pitch.
+ *
+ * NOTE(review): the implementations live in SDL_yuv.c, which is not visible
+ * here. Presumably the return value follows the usual SDL convention
+ * (0 on success, negative on error) - confirm against the definitions.
+ */
+
+extern int SDL_ConvertPixels_YUV_to_RGB(int width, int height, Uint32 src_format, const void *src, int src_pitch, Uint32 dst_format, void *dst, int dst_pitch);
+extern int SDL_ConvertPixels_RGB_to_YUV(int width, int height, Uint32 src_format, const void *src, int src_pitch, Uint32 dst_format, void *dst, int dst_pitch);
+extern int SDL_ConvertPixels_YUV_to_YUV(int width, int height, Uint32 src_format, const void *src, int src_pitch, Uint32 dst_format, void *dst, int dst_pitch);
+
+#endif /* SDL_yuv_c_h_ */
+
+/* vi: set ts=4 sw=4 expandtab: */
--- a/src/video/yuv2rgb/LICENSE
+++ b/src/video/yuv2rgb/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2016, Adrien Descamps
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of yuv2rgb nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/src/video/yuv2rgb/README.md
+++ b/src/video/yuv2rgb/README.md
@@ -0,0 +1,63 @@
+From: https://github.com/descampsa/yuv2rgb
+# yuv2rgb
+C library for fast image conversion between yuv420p and rgb24.
+
+This is a simple library for optimized image conversion between YUV420p and rgb24.
+It was done mainly as an exercise to learn to use SSE intrinsics, so there may still be room for optimization.
+
+For each conversion, a standard C optimized function and two SSE functions (with aligned and unaligned memory) are implemented.
+The SSE version requires only SSE2, which is available on any reasonably recent CPU.
+The library also supports the three different YUV (YCbCr to be correct) color spaces that exist (see comments in code), and others can easily be added.
+
+There is a simple test program that converts a raw YUV file to RGB PPM format and measures computation time.
+Optionally, it also compares the result and computation time with the ffmpeg implementation (that uses MMX), and with the IPP functions.
+
+To compile, simply run:
+
+    mkdir build
+    cd build
+    cmake -DCMAKE_BUILD_TYPE=Release ..
+    make
+
+The test program only supports raw YUV files for the YUV420 format, and ppm for the RGB24 format.
+To generate a raw yuv file, you can use avconv:
+
+    avconv -i example.jpg -c:v rawvideo -pix_fmt yuv420p example.yuv
+
+To generate the rgb file, you can use the ImageMagick convert program:
+
+    convert example.jpg example.ppm
+
+Then, for YUV420 to RGB24 conversion, use the test program like this:
+
+    ./test_yuv_rgb yuv2rgb image.yuv 4096 2160 image
+  
+The second and third parameters are the image width and height (which are needed because they are not stored in the raw YUV file), and the fourth parameter is the output filename template (several output files will be generated, named for example output_sse.ppm, output_av.ppm, etc.)
+
+Similarly, for RGB24 to YUV420 conversion:
+
+    ./test_yuv_rgb rgb2yuv image.ppm image
+
+On my computer, the test program on a 4K image gives the following for yuv2rgb:
+
+    Time will be measured in each configuration for 100 iterations...
+    Processing time (std) : 2.630193 sec
+    Processing time (sse2_unaligned) : 0.704394 sec
+    Processing time (ffmpeg_unaligned) : 1.221432 sec
+    Processing time (ipp_unaligned) : 0.636274 sec
+    Processing time (sse2_aligned) : 0.606648 sec
+    Processing time (ffmpeg_aligned) : 1.227100 sec
+    Processing time (ipp_aligned) : 0.636951 sec
+
+And for rgb2yuv:
+
+    Time will be measured in each configuration for 100 iterations...
+    Processing time (std) : 2.588675 sec
+    Processing time (sse2_unaligned) : 0.676625 sec
+    Processing time (ffmpeg_unaligned) : 3.385816 sec
+    Processing time (ipp_unaligned) : 0.593890 sec
+    Processing time (sse2_aligned) : 0.640630 sec
+    Processing time (ffmpeg_aligned) : 3.397952 sec
+    Processing time (ipp_aligned) : 0.579043 sec
+
+configuration : gcc 4.9.2, swscale 3.0.0, IPP 9.0.1, intel i7-5500U
--- a/src/video/yuv2rgb/yuv_rgb.c
+++ b/src/video/yuv2rgb/yuv_rgb.c
@@ -0,0 +1,687 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "../../SDL_internal.h"
+
+#include "yuv_rgb.h"
+
+#include "SDL_cpuinfo.h"
+/*#include <x86intrin.h>*/
+
+// Number of fractional bits of the fixed-point conversion coefficients
+#define PRECISION 6
+// Scale factor applied to the real-valued coefficients (see V()); results are scaled back with >>PRECISION
+#define PRECISION_FACTOR (1<<PRECISION)
+
+// Fixed-point parameters of one RGB -> YUV conversion.
+// The field order must not change: the RGB2YUV table initialises this struct positionally.
+typedef struct
+{
+	uint8_t y_shift;       // offset added to the luma result
+	int16_t matrix[3][3];  // coefficients scaled by PRECISION_FACTOR; rows are Y, U, V and columns R, G, B
+} RGB2YUVParam;
+// |Y|   |y_shift|                        |matrix[0][0] matrix[0][1] matrix[0][2]|   |R|
+// |U| = |  128  | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
+// |V|   |  128  |                        |matrix[2][0] matrix[2][1] matrix[2][2]|   |B|
+
+// Fixed-point parameters of one YUV -> RGB conversion (all factors are scaled by PRECISION_FACTOR).
+// The field order must not change: the YUV2RGB table initialises this struct positionally.
+typedef struct
+{
+	uint8_t y_shift;     // offset subtracted from Y before scaling
+	int16_t y_factor;    // luma gain, applied to all three channels
+	int16_t v_r_factor;  // contribution of (V-128) to red
+	int16_t u_g_factor;  // contribution of (U-128) to green
+	int16_t v_g_factor;  // contribution of (V-128) to green
+	int16_t u_b_factor;  // contribution of (U-128) to blue
+} YUV2RGBParam;
+// |R|                        |y_factor      0       v_r_factor|   |Y-y_shift|
+// |G| = 1/PRECISION_FACTOR * |y_factor  u_g_factor  v_g_factor| * |  U-128  |
+// |B|                        |y_factor  u_b_factor      0     |   |  V-128  |
+
+// Convert a real-valued coefficient to fixed point: scale it by PRECISION_FACTOR, add 0.5 and
+// truncate to int16_t, which rounds non-negative values to the nearest integer.
+// Negative coefficients are written as -V(x) in the tables so that they round symmetrically.
+// The argument and the whole expansion are parenthesised so that an expression argument
+// such as V(a+b) is scaled as a whole and surrounding operators cannot bind into the macro.
+#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5))
+
+// for ITU-T T.871, values can be found in section 7
+// for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// all values are rounded to the fourth decimal
+
+// YUV -> RGB parameters, one entry per colour space, in the order JPEG, BT.601, BT.709.
+// Initialisers are positional; the /*.field=*/ comments name the YUV2RGBParam field being set.
+// NOTE(review): the code indexing this table is in the included template headers, which are
+// not visible here - presumably it is indexed by YCbCrType like RGB2YUV; confirm.
+static const YUV2RGBParam YUV2RGB[3] = {
+	// ITU-T T.871 (JPEG)
+	{/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
+	// ITU-R BT.601-7
+	{/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
+	// ITU-R BT.709-6
+	{/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)}
+};
+
+// RGB -> YUV parameters, indexed by the yuv_type argument of the rgb24_yuv420_* functions
+// (entries are in the order JPEG, BT.601, BT.709).
+// Each matrix lists the Y row, then the U row, then the V row; columns multiply R, G and B.
+static const RGB2YUVParam RGB2YUV[3] = {
+	// ITU-T T.871 (JPEG)
+	{/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
+	// ITU-R BT.601-7
+	{/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
+	// ITU-R BT.709-6
+	{/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}}
+};
+
+/* The various layouts of YUV data we support */
+/* One of these is assigned to YUV_FORMAT before each inclusion of a template header below */
+#define YUV_FORMAT_420	1
+#define YUV_FORMAT_422	2
+#define YUV_FORMAT_NV12	3
+
+/* The various formats of RGB pixel that we support */
+/* One of these is assigned to RGB_FORMAT before each inclusion of a template header below */
+#define RGB_FORMAT_RGB565	1
+#define RGB_FORMAT_RGB24	2
+#define RGB_FORMAT_RGBA		3
+#define RGB_FORMAT_BGRA		4
+#define RGB_FORMAT_ARGB		5
+#define RGB_FORMAT_ABGR		6
+
+// Divide a fixed-point value by PRECISION_FACTOR and saturate the result to the [0:255] interval.
+//
+// v: value scaled by PRECISION_FACTOR. Callers are expected to stay within the
+//    [-128*PRECISION_FACTOR:384*PRECISION_FACTOR] range, but any int32_t is handled.
+// returns: 0 when v <= 0, 255 when v >= 255*PRECISION_FACTOR, v/PRECISION_FACTOR
+//          (rounded down) otherwise.
+//
+// This is done with comparisons rather than a fixed-size lookup table indexed by
+// (v+128*PRECISION_FACTOR)>>PRECISION: such a table yields the same values inside the
+// expected range, but is read out of bounds as soon as an input falls outside of it.
+static uint8_t clampU8(int32_t v)
+{
+	if(v <= 0)
+	{
+		return 0;
+	}
+	if(v >= (255<<PRECISION))
+	{
+		return 255;
+	}
+	// 0 < v < 255*PRECISION_FACTOR here, so the shifted value fits in 8 bits
+	return (uint8_t)(v>>PRECISION);
+}
+
+
+// Scalar converters for YUV_FORMAT_420 input, one per RGB output format.
+// yuv_rgb_std_func.h is used as a template: each inclusion is configured by the three macros
+// defined right before it, STD_FUNCTION_NAME being the name of the function to generate.
+// NOTE(review): the header is not visible here; it presumably #undefs these macros, since they
+// are redefined before the next inclusion without an #undef in this file - confirm.
+#define STD_FUNCTION_NAME	yuv420_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+
+// Scalar converters for YUV_FORMAT_422 input, one per RGB output format.
+// Each inclusion of the yuv_rgb_std_func.h template is configured by the three macros
+// defined right before it (STD_FUNCTION_NAME is the name of the function to generate).
+#define STD_FUNCTION_NAME	yuv422_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_argb_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+
+// Scalar converters for YUV_FORMAT_NV12 input, one per RGB output format.
+// Each inclusion of the yuv_rgb_std_func.h template is configured by the three macros
+// defined right before it (STD_FUNCTION_NAME is the name of the function to generate).
+#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_argb_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+
+// Convert a packed RGB24 image to planar YUV 4:2:0 (scalar implementation).
+//
+// width, height:   image size in pixels. Pixels are processed in 2x2 blocks, so when a
+//                  dimension is odd the last column/row is not written to the outputs.
+//                  Nothing is read or written when either dimension is 0 or 1.
+// RGB, RGB_stride: source pixels, 3 bytes per pixel in R, G, B order, and the byte offset
+//                  between the starts of two consecutive source rows.
+// Y, Y_stride:     destination luma plane (1 byte per pixel) and its row stride in bytes.
+// U, V, UV_stride: destination chroma planes (1 byte per 2x2 block of pixels) and their
+//                  common row stride in bytes.
+// yuv_type:        index into the RGB2YUV parameter table.
+void rgb24_yuv420_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *RGB, uint32_t RGB_stride, 
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	YCbCrType yuv_type)
+{
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+	
+	uint32_t x, y;
+	// The bounds are written as (y+1)<height and (x+1)<width instead of y<(height-1) and
+	// x<(width-1): the dimensions are unsigned, so the subtraction wraps around to a huge
+	// value when a dimension is 0 and the loops would then run far past the buffers.
+	for(y=0; (y+1)<height; y+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+			
+		uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+		
+		for(x=0; (x+1)<width; x+=2)
+		{
+			// compute yuv for the four pixels, u and v values are summed
+			int32_t y_tmp, u_tmp, v_tmp;
+			
+			// top-left pixel
+			y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
+			u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
+			v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
+			y_ptr1[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+			
+			// top-right pixel
+			y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
+			u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
+			v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
+			y_ptr1[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+			
+			// bottom-left pixel
+			y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
+			u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
+			v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
+			y_ptr2[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+			
+			// bottom-right pixel
+			y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
+			u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
+			v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
+			y_ptr2[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
+			
+			// chroma is the average of the four pixels, re-centred on 128
+			u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION));
+			v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION));
+			
+			rgb_ptr1 += 6;
+			rgb_ptr2 += 6;
+			y_ptr1 += 2;
+			y_ptr2 += 2;
+			u_ptr += 1;
+			v_ptr += 1;
+		}
+	}
+}
+
+#ifdef __SSE2__
+
+// SSE2 converters for YUV_FORMAT_420 input, two per RGB output format.
+// yuv_rgb_sse_func.h is used as a template configured by the macros defined right before each
+// inclusion. The *_sse variants additionally define SSE_ALIGNED, the *_sseu variants do not.
+// NOTE(review): the header is not visible here. Presumably SSE_ALIGNED selects aligned
+// loads/stores and STD_FUNCTION_NAME names the scalar routine the SSE function relies on
+// for what it cannot vectorise - confirm against yuv_rgb_sse_func.h.
+#define SSE_FUNCTION_NAME	yuv420_rgb565_sse
+#define STD_FUNCTION_NAME	yuv420_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgb565_sseu
+#define STD_FUNCTION_NAME	yuv420_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgb24_sse
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgb24_sseu
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgba_sse
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgba_sseu
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_bgra_sse
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_bgra_sseu
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_argb_sse
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_argb_sseu
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_abgr_sse
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_abgr_sseu
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+
+// SSE2 converters for YUV_FORMAT_422 input, two per RGB output format.
+// Each inclusion of the yuv_rgb_sse_func.h template is configured by the macros defined right
+// before it. The *_sse variants additionally define SSE_ALIGNED, the *_sseu variants do not.
+#define SSE_FUNCTION_NAME	yuv422_rgb565_sse
+#define STD_FUNCTION_NAME	yuv422_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb565_sseu
+#define STD_FUNCTION_NAME	yuv422_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb24_sse
+#define STD_FUNCTION_NAME	yuv422_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb24_sseu
+#define STD_FUNCTION_NAME	yuv422_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgba_sse
+#define STD_FUNCTION_NAME	yuv422_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgba_sseu
+#define STD_FUNCTION_NAME	yuv422_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_bgra_sse
+#define STD_FUNCTION_NAME	yuv422_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_bgra_sseu
+#define STD_FUNCTION_NAME	yuv422_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_argb_sse
+#define STD_FUNCTION_NAME	yuv422_argb_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_argb_sseu
+#define STD_FUNCTION_NAME	yuv422_argb_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_abgr_sse
+#define STD_FUNCTION_NAME	yuv422_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_abgr_sseu
+#define STD_FUNCTION_NAME	yuv422_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+
+// SSE2 converters for YUV_FORMAT_NV12 input, two per RGB output format.
+// Each inclusion of the yuv_rgb_sse_func.h template is configured by the macros defined right
+// before it. The *_sse variants additionally define SSE_ALIGNED, the *_sseu variants do not.
+#define SSE_FUNCTION_NAME	yuvnv12_rgb565_sse
+#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb565_sseu
+#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb24_sse
+#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb24_sseu
+#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgba_sse
+#define STD_FUNCTION_NAME	yuvnv12_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgba_sseu
+#define STD_FUNCTION_NAME	yuvnv12_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_bgra_sse
+#define STD_FUNCTION_NAME	yuvnv12_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_bgra_sseu
+#define STD_FUNCTION_NAME	yuvnv12_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_argb_sse
+#define STD_FUNCTION_NAME	yuvnv12_argb_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_argb_sseu
+#define STD_FUNCTION_NAME	yuvnv12_argb_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_abgr_sse
+#define STD_FUNCTION_NAME	yuvnv12_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_abgr_sseu
+#define STD_FUNCTION_NAME	yuvnv12_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+
+
+// Byte-shuffling helpers used by RGB2YUV_32 to de-interleave packed RGB24 data.
+// RGB1..RGB3 and RGB4..RGB6 are the three 16-byte registers loaded from the first and from
+// the second source row; they are overwritten and used as scratch space.
+// STEP1 and STEP2 are applied alternately (STEP1, STEP2, STEP1, STEP2, STEP1). Going by how
+// RGB2YUV_32 consumes the outputs, R1/G1/B1 then hold the red/green/blue bytes of the first
+// row and R2/G2/B2 those of the second row.
+// The last line of STEP2 and of UNPACK_RGB24_32 ends with a backslash, so the empty line
+// that follows each of them belongs to the macro and must be kept.
+#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
+R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
+G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
+G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
+B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
+B2 = _mm_unpackhi_epi8(RGB3, RGB6);
+
+#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+RGB1 = _mm_unpacklo_epi8(R1, G2); \
+RGB2 = _mm_unpackhi_epi8(R1, G2); \
+RGB3 = _mm_unpacklo_epi8(R2, B1); \
+RGB4 = _mm_unpackhi_epi8(R2, B1); \
+RGB5 = _mm_unpacklo_epi8(G1, B2); \
+RGB6 = _mm_unpackhi_epi8(G1, B2); \
+
+#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+
+// Convert eight pixels from RGB to YUV, working on 16-bit lanes.
+// R, G, B: inputs, one colour component per 16-bit lane.
+// Y, U, V: outputs, assigned by the macro. Each is the matching RGB2YUVParam matrix row applied
+//          to R/G/B, plus the offset (y_shift for Y, 128 for U and V) scaled by
+//          PRECISION_FACTOR, then shifted right arithmetically by PRECISION.
+// Requires a "param" pointer to the RGB2YUVParam in use to be in scope at the expansion site.
+// All sums are computed with 16-bit adds/multiplies, so they rely on fitting in an int16_t lane.
+#define RGB2YUV_16(R, G, B, Y, U, V) \
+Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \
+		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \
+Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \
+Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); \
+Y = _mm_srai_epi16(Y, PRECISION); \
+U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \
+		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \
+U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \
+U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); \
+U = _mm_srai_epi16(U, PRECISION); \
+V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \
+		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \
+V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \
+V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); \
+V = _mm_srai_epi16(V, PRECISION);
+
+// Convert a block of 32 columns by 2 rows of RGB24 pixels to YUV 4:2:0.
+// Expects the following names in scope at the expansion site:
+//   rgb_ptr1, rgb_ptr2: current position in the first / second source row (96 bytes are read from each)
+//   y_ptr1, y_ptr2:     current position in the two luma rows (32 bytes are written to each)
+//   u_ptr, v_ptr:       current position in the chroma rows (16 bytes are written to each)
+//   param:              RGB2YUVParam in use (read by RGB2YUV_16)
+//   LOAD_SI128, SAVE_SI128: the 128-bit load and store intrinsics to use
+// The macro declares local variables, so it can be expanded only once per scope.
+// The block is handled as two halves of 16 columns; chroma is averaged vertically for each
+// half with _mm_avg_epu8, then horizontally over pairs of neighbouring columns.
+#define RGB2YUV_32 \
+	__m128i r1, r2, b1, b2, g1, g2; \
+	__m128i r_16, g_16, b_16; \
+	__m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
+	__m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
+		rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
+		rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
+		rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
+		rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
+		rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
+	/* unpack rgb24 data to r, g and b data in separate channels*/ \
+	UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
+	/* process pixels of first line */ \
+	r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u1 = _mm_packus_epi16(u1_16, u2_16); \
+	v1 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr1), y); \
+	/* process pixels of second line */ \
+	r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u2 = _mm_packus_epi16(u1_16, u2_16); \
+	v2 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr2), y); \
+	/* vertical subsampling of u/v values */ \
+	u1_tmp = _mm_avg_epu8(u1, u2); \
+	v1_tmp = _mm_avg_epu8(v1, v2); \
+	/* do the same again with next data */ \
+	rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
+	rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
+	rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
+	rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
+	rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
+	rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
+	/* unpack rgb24 data to r, g and b data in separate channels*/ \
+	UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
+	/* process pixels of first line */ \
+	r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u1 = _mm_packus_epi16(u1_16, u2_16); \
+	v1 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr1+16), y); \
+	/* process pixels of second line */ \
+	r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u2 = _mm_packus_epi16(u1_16, u2_16); \
+	v2 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr2+16), y); \
+	/* vertical subsampling of u/v values */ \
+	u2_tmp = _mm_avg_epu8(u1, u2); \
+	v2_tmp = _mm_avg_epu8(v1, v2); \
+	/* horizontal subsampling of u/v values */ \
+	u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
+	v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
+	u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
+	v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
+	u1 = _mm_avg_epu8(u1, u2); \
+	v1 = _mm_avg_epu8(v1, v2); \
+	SAVE_SI128((__m128i*)(u_ptr), u1); \
+	SAVE_SI128((__m128i*)(v_ptr), v1);
+
+// Convert a packed RGB24 image to planar YUV 4:2:0 using SSE2, aligned variant.
+//
+// width, height:   image size in pixels. Pixels are processed in blocks of 32 columns by
+//                  2 rows: the last width%32 columns of every row, and the last row when
+//                  height is odd, are not written. Nothing is done when width < 32 or
+//                  height < 2.
+// RGB, RGB_stride: source pixels (3 bytes per pixel) and byte offset between source rows.
+// Y, Y_stride:     destination luma plane and its row stride in bytes.
+// U, V, UV_stride: destination chroma planes and their common row stride in bytes.
+// yuv_type:        index into the RGB2YUV parameter table.
+//
+// Data is accessed with _mm_load_si128 and _mm_stream_si128, so the start of every source
+// row and of every Y, U and V row must be 16-byte aligned.
+void rgb24_yuv420_sse(uint32_t width, uint32_t height, 
+	const uint8_t *RGB, uint32_t RGB_stride, 
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_load_si128
+	#define SAVE_SI128 _mm_stream_si128
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+	
+	uint32_t x, y;
+	// The bounds are written as (y+1)<height and (x+31)<width instead of y<(height-1) and
+	// x<(width-31): the dimensions are unsigned, so the subtraction wraps around to a huge
+	// value for a zero height or a width below 31, and the loops would then run far past
+	// the buffers.
+	for(y=0; (y+1)<height; y+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+		
+		uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+		
+		for(x=0; (x+31)<width; x+=32)
+		{
+			RGB2YUV_32
+			
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+			y_ptr1+=32;
+			y_ptr2+=32;
+			u_ptr+=16; 
+			v_ptr+=16;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+// Convert a packed RGB24 image to planar YUV 4:2:0 using SSE2, unaligned variant.
+//
+// width, height:   image size in pixels. Pixels are processed in blocks of 32 columns by
+//                  2 rows: the last width%32 columns of every row, and the last row when
+//                  height is odd, are not written. Nothing is done when width < 32 or
+//                  height < 2.
+// RGB, RGB_stride: source pixels (3 bytes per pixel) and byte offset between source rows.
+// Y, Y_stride:     destination luma plane and its row stride in bytes.
+// U, V, UV_stride: destination chroma planes and their common row stride in bytes.
+// yuv_type:        index into the RGB2YUV parameter table.
+//
+// Data is accessed with _mm_loadu_si128 and _mm_storeu_si128, so the buffers do not need
+// any particular alignment.
+void rgb24_yuv420_sseu(uint32_t width, uint32_t height, 
+	const uint8_t *RGB, uint32_t RGB_stride, 
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_loadu_si128
+	#define SAVE_SI128 _mm_storeu_si128
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+	
+	uint32_t x, y;
+	// The bounds are written as (y+1)<height and (x+31)<width instead of y<(height-1) and
+	// x<(width-31): the dimensions are unsigned, so the subtraction wraps around to a huge
+	// value for a zero height or a width below 31, and the loops would then run far past
+	// the buffers.
+	for(y=0; (y+1)<height; y+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+		
+		uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+		
+		for(x=0; (x+31)<width; x+=32)
+		{
+			RGB2YUV_32
+			
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+			y_ptr1+=32;
+			y_ptr2+=32;
+			u_ptr+=16; 
+			v_ptr+=16;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+
+#endif //__SSE2__
+
--- a/src/video/yuv2rgb/yuv_rgb.h
+++ b/src/video/yuv2rgb/yuv_rgb.h
@@ -0,0 +1,380 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
+
+// There are a few slightly different variations of the YCbCr color space with different parameters that 
+// change the conversion matrix.
+// The three most common YCbCr color space, defined by BT.601, BT.709 and JPEG standard are implemented here.
+// See the respective standards for details
+// The matrix values used are derived from http://www.equasys.de/colorconversion.html
+
+// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a 2 factor
+// For conversion from yuv to rgb, no interpolation is done, and the same UV value are used for 4 rgb pixels. This 
+// is suboptimal for image quality, but by far the fastest method.
+
+// For all methods, width and height should be even, if not, the last row/column of the result image won't be affected.
+// For the rgb to yuv sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected; the yuv to rgb sse methods hand those pixels to the standard c implementation.
+
+#include <stdint.h>
+
+typedef enum // selects the conversion parameters; the value is used as an index into the RGB2YUV / YUV2RGB tables
+{
+	YCBCR_JPEG, // YCbCr as defined by the JPEG standard
+	YCBCR_601, // YCbCr as defined by BT.601
+	YCBCR_709 // YCbCr as defined by BT.709
+} YCbCrType;
+
+// yuv to rgb, standard c implementation
+void yuv420_rgb565_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgb24_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgba_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_bgra_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_argb_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_abgr_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb565_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb24_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgba_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_bgra_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_argb_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_abgr_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb565_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb24_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgba_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_bgra_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_argb_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_abgr_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+// yuv to rgb, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void yuv420_rgb565_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgb24_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgba_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_bgra_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_argb_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_abgr_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb565_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb24_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgba_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_bgra_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_argb_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_abgr_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb565_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb24_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgba_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_bgra_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_argb_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_abgr_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+// yuv to rgb, sse implementation
+// pointers do not need to be 16 byte aligned
+void yuv420_rgb565_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgb24_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgba_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_bgra_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_argb_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_abgr_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb565_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb24_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgba_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_bgra_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_argb_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_abgr_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb565_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb24_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgba_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_bgra_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_argb_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_abgr_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+
+// rgb to yuv, standard c implementation
+void rgb24_yuv420_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *rgb, uint32_t rgb_stride, 
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	YCbCrType yuv_type);
+
+// rgb to yuv, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void rgb24_yuv420_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *rgb, uint32_t rgb_stride, 
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	YCbCrType yuv_type);
+
+// rgb to yuv, sse implementation
+// pointers do not need to be 16 byte aligned
+void rgb24_yuv420_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *rgb, uint32_t rgb_stride, 
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	YCbCrType yuv_type);
+
--- a/src/video/yuv2rgb/yuv_rgb_sse_func.h
+++ b/src/video/yuv2rgb/yuv_rgb_sse_func.h
@@ -0,0 +1,498 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+/* You need to define the following macros before including this file:
+	SSE_FUNCTION_NAME
+	STD_FUNCTION_NAME
+	YUV_FORMAT
+	RGB_FORMAT
+*/
+/* You may define the following macro, which affects generated code:
+	SSE_ALIGNED
+*/
+
+#ifdef SSE_ALIGNED
+/* Aligned loads and streaming stores are disabled on purpose (left below for reference): unaligned instructions seem faster, even on aligned data? Both branches therefore select the same intrinsics for now. */
+/*
+#define LOAD_SI128 _mm_load_si128
+#define SAVE_SI128 _mm_stream_si128
+*/
+#define LOAD_SI128 _mm_loadu_si128
+#define SAVE_SI128 _mm_storeu_si128
+#else
+#define LOAD_SI128 _mm_loadu_si128
+#define SAVE_SI128 _mm_storeu_si128
+#endif
+/* UV2RGB_16: multiplies eight 16-bit U/V lanes by the param factors to get the chroma terms of R, G and B, then duplicates every lane so each term covers two adjacent pixels (R1/G1/B1 from the low half, R2/G2/B2 from the high half). Uses r_tmp, g_tmp, b_tmp and param from the expansion scope. */
+#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
+	r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
+	g_tmp = _mm_add_epi16( \
+		_mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
+		_mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
+	b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
+	R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
+	G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
+	B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
+	R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
+	G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
+	B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \
+
+#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) /* Y1 and Y2 are overwritten with (Y - y_shift) * y_factor */ \
+	Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
+	Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
+	/* add the scaled luma to the chroma terms already in R/G/B, then arithmetic-shift right by PRECISION */ \
+	R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
+	G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
+	B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
+	R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
+	G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
+	B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \
+
+#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
+{ \
+	__m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
+	/* per 16-bit pixel: top 5 bits of R land in bits 15..11, top 6 bits of G in bits 10..5, top 5 bits of B in bits 4..0 */ \
+	red_mask = _mm_set1_epi16(0xF800); \
+	RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \
+	RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
+	RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
+	RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
+	tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \
+	tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
+	tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
+	tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
+	RGB1 = _mm_or_si128(RGB1, tmp1); \
+	RGB2 = _mm_or_si128(RGB2, tmp2); \
+	RGB3 = _mm_or_si128(RGB3, tmp3); \
+	RGB4 = _mm_or_si128(RGB4, tmp4); \
+	tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \
+	tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
+	tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
+	tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
+	RGB1 = _mm_or_si128(RGB1, tmp1); \
+	RGB2 = _mm_or_si128(RGB2, tmp2); \
+	RGB3 = _mm_or_si128(RGB3, tmp3); \
+	RGB4 = _mm_or_si128(RGB4, tmp4); \
+}
+/* PACK_RGB24_32: five alternating STEP1/STEP2 passes. Each pass rewrites six vectors as the even bytes of each of its three input pairs followed by the odd bytes of each pair (STEP1 reads R1..B2 and writes RGB1..RGB6, STEP2 the reverse). The final pass leaves the result in RGB1..RGB6; R1..B2 are clobbered. */
+#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
+RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
+RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
+RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
+RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
+RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \
+
+#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
+R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
+G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
+G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
+B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
+B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \
+
+#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+
+#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
+{ \
+	__m128i lo_ab, hi_ab, lo_gr, hi_gr; \
+	/* 16 pixels from the "1" planes: byte-interleave A/B and G/R, then interleave those 16-bit pairs, giving bytes A, B, G, R per pixel in memory */ \
+	lo_ab = _mm_unpacklo_epi8( A1, B1 ); \
+	hi_ab = _mm_unpackhi_epi8( A1, B1 ); \
+	lo_gr = _mm_unpacklo_epi8( G1, R1 ); \
+	hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
+	RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
+	RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
+	RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
+	RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
+	/* 16 pixels from the "2" planes, same interleave */ \
+	lo_ab = _mm_unpacklo_epi8( A2, B2 ); \
+	hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
+	lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
+	hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
+	RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
+	RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
+	RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
+	RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
+}
+/* PACK_PIXEL: declares the rgb_N output vectors and fills them from the 8-bit r/g/b planes left by YUV2RGB_32, first line then second line. The four 32-bit layouts all reuse PACK_RGBA_32 and differ only in the order the planes and the constant 0xFF alpha vector are passed. */
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	\
+	PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
+	\
+	PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
+	__m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
+	\
+	PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
+	\
+	PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8( 0xFF ); \
+	\
+	PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8( 0xFF ); \
+	\
+	PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8( 0xFF ); \
+	\
+	PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8( 0xFF ); \
+	\
+	PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#else
+#error PACK_PIXEL unimplemented
+#endif
+/* SAVE_LINE1 / SAVE_LINE2: store the rgb_N vectors produced by PACK_PIXEL for 32 pixels at rgb_ptr1 / rgb_ptr2 (64 bytes per line for RGB565, 96 for RGB24, 128 for the 32-bit formats). */
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+
+#define SAVE_LINE1 \
+	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+
+#define SAVE_LINE2 \
+	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+
+#define SAVE_LINE1 \
+	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
+
+#define SAVE_LINE2 \
+	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+
+#define SAVE_LINE1 \
+	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \
+
+#define SAVE_LINE2 \
+	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \
+
+#else
+#error SAVE_LINE unimplemented
+#endif
+/* READ_Y(y_ptr) fills `y` with 16 luma bytes; READ_UV fills `u` and `v` with 16 chroma bytes each. The 420 variant loads them directly; the 422 and NV12 variants keep only the lowest byte of every 16-bit (or, for 422 chroma, 32-bit) group loaded from the given pointer and pack the survivors together. */
+#if YUV_FORMAT == YUV_FORMAT_420
+
+#define READ_Y(y_ptr) \
+	y = LOAD_SI128((const __m128i*)(y_ptr)); \
+
+#define READ_UV	\
+	u = LOAD_SI128((const __m128i*)(u_ptr)); \
+	v = LOAD_SI128((const __m128i*)(v_ptr)); \
+
+#elif YUV_FORMAT == YUV_FORMAT_422
+
+#define READ_Y(y_ptr) \
+{ \
+	__m128i y1, y2; \
+	y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
+	y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
+	y = _mm_packus_epi16(y1, y2); \
+}
+
+#define READ_UV	\
+{ \
+	__m128i u1, u2, u3, u4, v1, v2, v3, v4; \
+	u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
+	u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
+	u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
+	u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
+	u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
+	v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
+	v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
+	v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
+	v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
+	v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
+}
+
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+
+#define READ_Y(y_ptr) \
+	y = LOAD_SI128((const __m128i*)(y_ptr)); \
+
+#define READ_UV	\
+{ \
+	__m128i u1, u2, v1, v2; \
+	u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
+	u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
+	u = _mm_packus_epi16(u1, u2); \
+	v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
+	v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
+	v = _mm_packus_epi16(v1, v2); \
+}
+
+#else
+#error READ_UV unimplemented
+#endif
+/* YUV2RGB_32: converts one tile of 32 pixels x 2 lines. Reads 16 U/V samples once, reuses their chroma terms for both lines, and leaves saturated 8-bit planes in r_8_LH / g_8_LH / b_8_LH (L = line 1 or 2, H = first or last 16 pixels). Declares its own locals (including a vector named `y`), and expects param, y_pixel_stride, y_ptr1, y_ptr2, u_ptr and v_ptr in scope. Note that both y_ptr1 and y_ptr2 are always read. */
+#define YUV2RGB_32 \
+	__m128i r_tmp, g_tmp, b_tmp; \
+	__m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
+	__m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
+	__m128i y_16_1, y_16_2; \
+	__m128i y, u, v, u_16, v_16; \
+    __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
+    __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
+	\
+	READ_UV \
+	\
+	/* process first 16 pixels of first line */\
+	u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
+	v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
+	u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
+	v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
+	\
+	UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
+	r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
+	\
+	READ_Y(y_ptr1) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+	/* process first 16 pixels of second line */\
+	r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
+	r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
+	\
+	READ_Y(y_ptr2) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+	/* process last 16 pixels of first line */\
+	u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
+	v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
+	u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
+	v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
+	\
+	UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
+	r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
+	\
+	READ_Y(y_ptr1+16*y_pixel_stride) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+	/* process last 16 pixels of second line */\
+	r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
+	r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
+	\
+	READ_Y(y_ptr2+16*y_pixel_stride) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+
+// YUV_FORMAT -> RGB_FORMAT conversion: complete 32-pixel tiles are handled with SSE2, the odd last line and the right-hand (width%32) columns are delegated to STD_FUNCTION_NAME.
+void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, 
+	const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	uint8_t *RGB, uint32_t RGB_stride, 
+	YCbCrType yuv_type)
+{
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420
+	const int y_pixel_stride = 1;
+	const int uv_pixel_stride = 1;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 2;
+#elif YUV_FORMAT == YUV_FORMAT_422
+	const int y_pixel_stride = 2;
+	const int uv_pixel_stride = 4;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 1;
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+	const int y_pixel_stride = 1;
+	const int uv_pixel_stride = 2;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 2;
+#endif
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+	const int rgb_pixel_stride = 2;
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+	const int rgb_pixel_stride = 3;
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+	const int rgb_pixel_stride = 4;
+#else
+#error Unknown RGB pixel size
+#endif
+	/* *_pixel_stride: bytes from one sample to the next; uv_x/uv_y_sample_interval: pixels/lines sharing one U/V sample */
+	if (width >= 32) {
+		uint32_t xpos, ypos;
+		for(ypos=0; ypos+(uv_y_sample_interval-1)<height; ypos+=uv_y_sample_interval) /* not "height-k": that wraps when height is 0 */
+		{
+			const uint8_t *y_ptr1=Y+ypos*Y_stride,
+				*y_ptr2=Y+(ypos+(uv_y_sample_interval-1))*Y_stride,
+				*u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+				*v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+			/* with one line per chroma row the "second line" aliases the first, so YUV2RGB_32 never reads past the last row */
+			uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+				*rgb_ptr2=RGB+(ypos+(uv_y_sample_interval-1))*RGB_stride;
+			
+			for(xpos=0; xpos+31<width; xpos+=32)
+			{
+				YUV2RGB_32
+				{
+					PACK_PIXEL
+					SAVE_LINE1
+					if (uv_y_sample_interval > 1)
+					{
+						SAVE_LINE2
+					}
+				}
+				/* step every pointer to the next 32-pixel tile */
+				y_ptr1+=32*y_pixel_stride;
+				y_ptr2+=32*y_pixel_stride;
+				u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
+				v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
+				rgb_ptr1+=32*rgb_pixel_stride;
+				rgb_ptr2+=32*rgb_pixel_stride;
+			}
+		}
+
+		/* Catch the last line, if needed */
+		if (uv_y_sample_interval == 2 && ypos == (height-1))
+		{
+			const uint8_t *y_ptr=Y+ypos*Y_stride,
+				*u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+				*v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+			
+			uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
+
+			STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+		}
+	}
+
+	/* Catch the right column, if needed */
+	{
+		uint32_t converted = (width & ~31u);
+		if (converted != width)
+		{
+			const uint8_t *y_ptr=Y+converted*y_pixel_stride,
+				*u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
+				*v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
+			
+			uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
+
+			STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+		}
+	}
+}
+
+#undef SSE_FUNCTION_NAME
+#undef STD_FUNCTION_NAME
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef SSE_ALIGNED
+#undef LOAD_SI128
+#undef SAVE_SI128
+#undef UV2RGB_16
+#undef ADD_Y2RGB_16
+#undef PACK_RGB24_32_STEP1
+#undef PACK_RGB24_32_STEP2
+#undef PACK_RGB24_32
+#undef PACK_RGBA_32
+#undef PACK_PIXEL
+#undef SAVE_LINE1
+#undef SAVE_LINE2
+#undef READ_Y
+#undef READ_UV
+#undef YUV2RGB_32
--- a/src/video/yuv2rgb/yuv_rgb_std_func.h
+++ b/src/video/yuv2rgb/yuv_rgb_std_func.h
@@ -0,0 +1,220 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+/* You need to define the following macros before including this file:
+	STD_FUNCTION_NAME
+	YUV_FORMAT
+	RGB_FORMAT
+*/
+
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+
+#define PACK_PIXEL(rgb_ptr) \
+	*(Uint16 *)rgb_ptr = \
+		((((Uint16)clampU8(y_tmp+r_tmp)) << 8 ) & 0xF800) | \
+		((((Uint16)clampU8(y_tmp+g_tmp)) << 3) & 0x07E0) | \
+		(((Uint16)clampU8(y_tmp+b_tmp)) >> 3); \
+	rgb_ptr += 2; \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+
+#define PACK_PIXEL(rgb_ptr) \
+	rgb_ptr[0] = clampU8(y_tmp+r_tmp); \
+	rgb_ptr[1] = clampU8(y_tmp+g_tmp); \
+	rgb_ptr[2] = clampU8(y_tmp+b_tmp); \
+	rgb_ptr += 3; \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+
+#define PACK_PIXEL(rgb_ptr) \
+	*(Uint32 *)rgb_ptr = \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 24) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 8) | \
+		0x000000FF; \
+	rgb_ptr += 4; \
+
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+
+#define PACK_PIXEL(rgb_ptr) \
+	*(Uint32 *)rgb_ptr = \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 24) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 8) | \
+		0x000000FF; \
+	rgb_ptr += 4; \
+
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+
+#define PACK_PIXEL(rgb_ptr) \
+	*(Uint32 *)rgb_ptr = \
+		0xFF000000 | \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 0); \
+	rgb_ptr += 4; \
+
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+
+#define PACK_PIXEL(rgb_ptr) \
+	*(Uint32 *)rgb_ptr = \
+		0xFF000000 | \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 0); \
+	rgb_ptr += 4; \
+
+#else
+#error PACK_PIXEL unimplemented
+#endif
+
+/* Scalar YUV -> RGB converter: visits one chroma sample per step and emits every pixel that shares it through PACK_PIXEL. */
+void STD_FUNCTION_NAME(
+	uint32_t width, uint32_t height, /* image size in pixels; a zero dimension converts nothing */
+	const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, /* source samples; strides are bytes per row */
+	uint8_t *RGB, uint32_t RGB_stride, /* destination pixels; stride is bytes per row */
+	YCbCrType yuv_type) /* index of the coefficient set in YUV2RGB[] */
+{
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420 /* one U and one V sample per 2x2 pixels; neighbouring samples 1 byte apart */
+	const int y_pixel_stride = 1;
+	const int uv_pixel_stride = 1;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 2;
+#elif YUV_FORMAT == YUV_FORMAT_422 /* one U and one V sample per 2x1 pixels; Y samples 2 bytes apart, chroma samples 4 bytes apart */
+	const int y_pixel_stride = 2;
+	const int uv_pixel_stride = 4;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 1;
+#elif YUV_FORMAT == YUV_FORMAT_NV12 /* one U and one V sample per 2x2 pixels; chroma samples 2 bytes apart */
+	const int y_pixel_stride = 1;
+	const int uv_pixel_stride = 2;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 2;
+#endif
+	/* Loop bounds are written as "index + k < size" so a zero width or height cannot wrap the unsigned subtraction. */
+	uint32_t x, y;
+	for(y=0; (y+(uv_y_sample_interval-1))<height; y+=uv_y_sample_interval)
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
+			*v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
+		
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride; /* second-row pointers are only dereferenced when uv_y_sample_interval > 1 */
+		
+		for(x=0; (x+(uv_x_sample_interval-1))<width; x+=uv_x_sample_interval)
+		{
+			// Compute U and V contributions, shared by every pixel of this chroma group
+			
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+			
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+			
+			// Compute the Y contribution for each pixel; PACK_PIXEL stores it and advances the row pointer
+			
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+			
+			y_tmp = ((y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+			
+			if (uv_y_sample_interval > 1) { /* the row below shares this chroma sample */
+				y_tmp = ((y_ptr2[0]-param->y_shift)*param->y_factor);
+				PACK_PIXEL(rgb_ptr2);
+				
+				y_tmp = ((y_ptr2[y_pixel_stride]-param->y_shift)*param->y_factor);
+				PACK_PIXEL(rgb_ptr2);
+			}
+
+			y_ptr1+=2*y_pixel_stride;
+			y_ptr2+=2*y_pixel_stride;
+			u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+			v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+		}
+
+		/* Odd width: catch the last column, which has no right-hand neighbour */
+		if (uv_x_sample_interval == 2 && x == (width-1))
+		{
+			// Compute U and V contributions for the trailing column
+			
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+			
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+			
+			// Compute the Y contribution for the single pixel of each row
+			
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+			
+			if (uv_y_sample_interval > 1) {
+				y_tmp = ((y_ptr2[0]-param->y_shift)*param->y_factor);
+				PACK_PIXEL(rgb_ptr2);
+			}
+		}
+	}
+
+	/* Odd height: catch the last line, which has no row below it */
+	if (uv_y_sample_interval == 2 && y == (height-1))
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
+			*v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
+		
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride;
+		/* Same wrap-safe bound as the main loop, so width == 0 converts nothing here either. */
+		for(x=0; (x+(uv_x_sample_interval-1))<width; x+=uv_x_sample_interval)
+		{
+			// Compute U and V contributions, shared by the two pixels of this pair
+			
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+			
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+			
+			// Compute the Y contribution for each pixel
+			
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+			
+			y_tmp = ((y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+			
+			y_ptr1+=2*y_pixel_stride;
+			u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+			v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+		}
+
+		/* Odd width as well: catch the bottom-right corner pixel */
+		if (uv_x_sample_interval == 2 && x == (width-1))
+		{
+			// Compute U and V contributions for the corner pixel
+			
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+			
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+			
+			// Compute the Y contribution for the pixel
+			
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+		}
+	}
+}
+
+#undef STD_FUNCTION_NAME
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef PACK_PIXEL