Move xxhash to boo; cached shader components

2025-12-08 21:17:50 +00:00 · 2017-03-04 21:54:58 -10:00
parent 8bcac27c10
commit 03f155fcf5
16 changed files with 1558 additions and 217 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,8 @@ if (NOT TARGET logvisor)
    add_subdirectory(logvisor)
 endif()
 add_subdirectory(xxhash)
 set(WITH_LSR_BINDINGS OFF)
 set(BUILD_TESTS OFF)
 set(BUILD_SHARED_LIBS OFF)
@@ -18,7 +20,7 @@ add_subdirectory(soxr)
 set(BOO_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE PATH "boo include path" FORCE)
-include_directories(include ${LOGVISOR_INCLUDE_DIR})
+include_directories(include xxhash ${LOGVISOR_INCLUDE_DIR})
 if(NOT GEKKO AND NOT CAFE)
 list(APPEND PLAT_SRCS
@@ -195,7 +197,7 @@ target_include_directories(glslang-default-resource-limits
    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/glslang
 )
-list(APPEND _BOO_SYS_LIBS glslang HLSL soxr OSDependent OGLCompiler SPIRV glslang-default-resource-limits)
+list(APPEND _BOO_SYS_LIBS glslang HLSL soxr xxhash OSDependent OGLCompiler SPIRV glslang-default-resource-limits)
 set(BOO_SYS_LIBS ${_BOO_SYS_LIBS} CACHE PATH "boo system libraries" FORCE)
 set(BOO_SYS_DEFINES ${_BOO_SYS_DEFINES} CACHE PATH "boo system defines" FORCE)
--- a/include/boo/graphicsdev/GL.hpp
+++ b/include/boo/graphicsdev/GL.hpp
@@ -5,39 +5,16 @@
 #include "IGraphicsCommandQueue.hpp"
 #include "boo/IGraphicsContext.hpp"
 #include "GLSLMacros.hpp"
 #include <vector>
 #include <unordered_set>
 #include <unordered_map>
 #include <mutex>
 namespace boo
 {
 class GLDataFactory : public IGraphicsDataFactory
 {
    friend struct GLCommandQueue;
    IGraphicsContext* m_parent;
    uint32_t m_drawSamples;
    static ThreadLocalPtr<struct GLData> m_deferredData;
    std::unordered_set<struct GLData*> m_committedData;
    std::unordered_set<struct GLPool*> m_committedPools;
    std::mutex m_committedMutex;
    void destroyData(IGraphicsData*);
    void destroyAllData();
    void destroyPool(IGraphicsBufferPool*);
    IGraphicsBufferD* newPoolBuffer(IGraphicsBufferPool* pool, BufferUse use,
                                    size_t stride, size_t count);
    void deletePoolBuffer(IGraphicsBufferPool* p, IGraphicsBufferD* buf);
 public:
    GLDataFactory(IGraphicsContext* parent, uint32_t drawSamples);
    ~GLDataFactory() {destroyAllData();}
    Platform platform() const {return Platform::OpenGL;}
    const SystemChar* platformName() const {return _S("OpenGL");}
    class Context : public IGraphicsDataFactory::Context
    {
-        friend class GLDataFactory;
+        friend class GLDataFactoryImpl;
        GLDataFactory& m_parent;
        Context(GLDataFactory& parent) : m_parent(parent) {}
    public:
@@ -73,9 +50,6 @@ public:
                             const size_t* ubufOffs, const size_t* ubufSizes,
                             size_t texCount, ITexture** texs, size_t baseVert = 0, size_t baseInst = 0);
    };
    GraphicsDataToken commitTransaction(const FactoryCommitFunc&);
    GraphicsBufferPoolToken newBufferPool();
 };
 }
--- a/include/boo/graphicsdev/IGraphicsCommandQueue.hpp
+++ b/include/boo/graphicsdev/IGraphicsCommandQueue.hpp
@@ -10,7 +10,7 @@ namespace boo
 struct IGraphicsCommandQueue
 {
-    virtual ~IGraphicsCommandQueue() {}
+    virtual ~IGraphicsCommandQueue() = default;
    using Platform = IGraphicsDataFactory::Platform;
    virtual Platform platform() const=0;
--- a/include/boo/graphicsdev/IGraphicsDataFactory.hpp
+++ b/include/boo/graphicsdev/IGraphicsDataFactory.hpp
@@ -17,7 +17,7 @@ struct IGraphicsBuffer
 protected:
    bool m_dynamic;
    IGraphicsBuffer(bool dynamic) : m_dynamic(dynamic) {}
-    virtual ~IGraphicsBuffer() {}
+    virtual ~IGraphicsBuffer() = default;
 };
 /** Static resource buffer for verts, indices, uniform constants */
@@ -190,7 +190,7 @@ enum class BlendFactor
 /** Factory object for creating batches of resources as an IGraphicsData token */
 struct IGraphicsDataFactory
 {
-    virtual ~IGraphicsDataFactory() {}
+    virtual ~IGraphicsDataFactory() = default;
    enum class Platform
    {
@@ -276,10 +276,10 @@ using FactoryCommitFunc = std::function<bool(IGraphicsDataFactory::Context& ctx)
 *  IGraphicsData (please don't delete and draw contained resources in the same frame). */
 class GraphicsDataToken
 {
-    friend class GLDataFactory;
+    friend class GLDataFactoryImpl;
    friend class D3D12DataFactory;
    friend class D3D11DataFactory;
-    friend class MetalDataFactory;
+    friend class MetalDataFactoryImpl;
    friend class VulkanDataFactory;
    IGraphicsDataFactory* m_factory = nullptr;
    IGraphicsData* m_data = nullptr;
@@ -323,10 +323,10 @@ public:
 *  (please don't delete and draw contained resources in the same frame). */
 class GraphicsBufferPoolToken
 {
-    friend class GLDataFactory;
+    friend class GLDataFactoryImpl;
    friend class D3D12DataFactory;
    friend class D3D11DataFactory;
-    friend class MetalDataFactory;
+    friend class MetalDataFactoryImpl;
    friend class VulkanDataFactory;
    IGraphicsDataFactory* m_factory = nullptr;
    IGraphicsBufferPool* m_pool = nullptr;
--- a/include/boo/graphicsdev/Metal.hpp
+++ b/include/boo/graphicsdev/Metal.hpp
@@ -6,42 +6,16 @@
 #include "IGraphicsDataFactory.hpp"
 #include "IGraphicsCommandQueue.hpp"
 #include "boo/IGraphicsContext.hpp"
 #include <vector>
 #include <mutex>
 #include <unordered_set>
 #include <unordered_map>
 namespace boo
 {
 struct MetalContext;
 class MetalDataFactory : public IGraphicsDataFactory
 {
    friend struct MetalCommandQueue;
    IGraphicsContext* m_parent;
    static ThreadLocalPtr<struct MetalData> m_deferredData;
    std::unordered_set<struct MetalData*> m_committedData;
    std::unordered_set<struct MetalPool*> m_committedPools;
    std::mutex m_committedMutex;
    struct MetalContext* m_ctx;
    uint32_t m_sampleCount;
    void destroyData(IGraphicsData*);
    void destroyAllData();
    void destroyPool(IGraphicsBufferPool*);
    IGraphicsBufferD* newPoolBuffer(IGraphicsBufferPool* pool, BufferUse use,
                                    size_t stride, size_t count);
    void deletePoolBuffer(IGraphicsBufferPool* p, IGraphicsBufferD* buf);
 public:
    MetalDataFactory(IGraphicsContext* parent, MetalContext* ctx, uint32_t sampleCount);
    ~MetalDataFactory() {}
    Platform platform() const {return Platform::Metal;}
    const char* platformName() const {return "Metal";}
    class Context : public IGraphicsDataFactory::Context
    {
-        friend class MetalDataFactory;
+        friend class MetalDataFactoryImpl;
        MetalDataFactory& m_parent;
        Context(MetalDataFactory& parent) : m_parent(parent) {}
    public:
@@ -76,9 +50,6 @@ public:
                             const size_t* ubufOffs, const size_t* ubufSizes,
                             size_t texCount, ITexture** texs, size_t baseVert = 0, size_t baseInst = 0);
    };
    GraphicsDataToken commitTransaction(const std::function<bool(IGraphicsDataFactory::Context& ctx)>&);
    GraphicsBufferPoolToken newBufferPool();
 };
 }
--- a/lib/graphicsdev/Common.hpp
+++ b/lib/graphicsdev/Common.hpp
@@ -50,6 +50,44 @@ public:
    Token lock() const { return Token(this); }
 };
 /** Reference-counted base for shader objects that a factory shares between pipelines.
  *
  *  FactoryImpl must provide `_unregisterShareableShader(uint64_t key)`; it is invoked
  *  with the key given at construction when the last reference is released.
  *  ShaderImpl is the concrete derived type (CRTP) returned by Token::get().
  *
  *  The count starts at zero; references are only taken and released through Token,
  *  so an object that is never locked never triggers unregistration. */
 template <class FactoryImpl, class ShaderImpl>
 class IShareableShader
 {
    std::atomic_int m_refCount = {0};
    FactoryImpl& m_factory;
    uint64_t m_key;
 public:
    IShareableShader(FactoryImpl& factory, uint64_t key)
    : m_factory(factory), m_key(key) {}
    /** Take one reference. */
    void increment() { m_refCount++; }
    /** Drop one reference; when the count goes from 1 to 0 the factory is told to
     *  unregister this shader. The factory callback may destroy `*this`, so nothing
     *  may touch the object after that call. */
    void decrement()
    {
        if (m_refCount.fetch_sub(1) == 1)
            m_factory._unregisterShareableShader(m_key);
    }
    /** Move-only handle owning exactly one reference to a shareable shader
     *  (or none, when default-constructed or moved-from). */
    class Token
    {
        IShareableShader<FactoryImpl, ShaderImpl>* m_parent = nullptr;
    public:
        Token() = default;
        /** Acquire a reference on `p`; a null `p` yields an empty token. */
        Token(IShareableShader* p)
        : m_parent(p)
        { if (m_parent) m_parent->increment(); }
        Token& operator=(const Token&) = delete;
        Token(const Token&) = delete;
        /** Take over `other`'s reference and release the one previously held.
         *  The new reference is adopted before the old one is dropped, so assigning
         *  between two tokens of the same shader can never pass through a zero count. */
        Token& operator=(Token&& other)
        {
            if (this != &other)
            {
                IShareableShader<FactoryImpl, ShaderImpl>* released = m_parent;
                m_parent = other.m_parent;
                other.m_parent = nullptr;
                if (released)
                    released->decrement();
            }
            return *this;
        }
        /** Steal `other`'s reference, leaving it empty. */
        Token(Token&& other)
        : m_parent(other.m_parent)
        { other.m_parent = nullptr; }
        ~Token() { if (m_parent) m_parent->decrement(); }
        /** True when the token currently holds a reference. */
        operator bool() const { return m_parent != nullptr; }
        /** Access the concrete shader; only valid on a non-empty token. */
        ShaderImpl& get() const { return static_cast<ShaderImpl&>(*m_parent); }
    };
    /** Create a token holding a new reference to this shader. */
    Token lock() { return Token(this); }
 };
 }
 #endif // BOO_GRAPHICSDEV_COMMON_HPP
--- a/lib/graphicsdev/GL.cpp
+++ b/lib/graphicsdev/GL.cpp
@@ -8,7 +8,10 @@
 #include <condition_variable>
 #include <array>
 #include <unordered_map>
 #include <unordered_set>
 #include <atomic>
 #include <functional>
 #include "xxhash.h"
 #include "logvisor/logvisor.hpp"
@@ -18,8 +21,47 @@
 namespace boo
 {
 static logvisor::Module Log("boo::GL");
 class GLDataFactoryImpl;
-ThreadLocalPtr<struct GLData> GLDataFactory::m_deferredData;
+struct GLShareableShader : IShareableShader<GLDataFactoryImpl, GLShareableShader>
 {
    GLuint m_shader = 0;
    GLShareableShader(GLDataFactoryImpl& fac, uint64_t key, GLuint s)
    : IShareableShader(fac, key), m_shader(s) {}
    ~GLShareableShader() { glDeleteShader(m_shader); }
 };
 /** Concrete OpenGL data factory, private to this translation unit.
  *  Derives from the public GLDataFactory and holds the state the implementation uses,
  *  including the table of shared shader stages. Instances are created by
  *  _NewGLDataFactory(). */
 class GLDataFactoryImpl : public GLDataFactory
 {
    friend struct GLCommandQueue;
    friend class GLDataFactory::Context;
    IGraphicsContext* m_parent;
    uint32_t m_drawSamples;
    /* Thread-local GLData that Context::new* calls append resources to;
     * commitTransaction() treats a non-null value on entry as nested usage. */
    static ThreadLocalPtr<struct GLData> m_deferredData;
    std::unordered_set<struct GLData*> m_committedData;
    std::unordered_set<struct GLPool*> m_committedPools;
    /* Locked by newBufferPool()/destroyData()/destroyAllData()/destroyPool() and by
     * GLCommandQueue while it walks m_committedData. */
    std::mutex m_committedMutex;
    /* Shared shader objects keyed by the XXH64 hash of their GLSL source
     * (see Context::newShaderPipeline). */
    std::unordered_map<uint64_t, std::unique_ptr<GLShareableShader>> m_sharedShaders;
    void destroyData(IGraphicsData*);
    void destroyAllData();
    void destroyPool(IGraphicsBufferPool*);
    IGraphicsBufferD* newPoolBuffer(IGraphicsBufferPool* pool, BufferUse use,
                                    size_t stride, size_t count);
    void deletePoolBuffer(IGraphicsBufferPool* p, IGraphicsBufferD* buf);
 public:
    GLDataFactoryImpl(IGraphicsContext* parent, uint32_t drawSamples);
    ~GLDataFactoryImpl() {destroyAllData();}
    Platform platform() const {return Platform::OpenGL;}
    const SystemChar* platformName() const {return _S("OpenGL");}
    GraphicsDataToken commitTransaction(const FactoryCommitFunc&);
    GraphicsBufferPoolToken newBufferPool();
    /* Called by IShareableShader::decrement() when the last Token is released.
     * Erasing the entry destroys the GLShareableShader, whose destructor calls
     * glDeleteShader().
     * NOTE(review): this erase is not guarded by m_committedMutex -- confirm that
     * tokens are only created and released on one thread. */
    void _unregisterShareableShader(uint64_t key) { m_sharedShaders.erase(key); }
 };
 ThreadLocalPtr<struct GLData> GLDataFactoryImpl::m_deferredData;
 struct GLData : IGraphicsDataPriv<GLData>
 {
    std::vector<std::unique_ptr<class GLShaderPipeline>> m_SPs;
@@ -75,6 +117,7 @@ public:
 class GLGraphicsBufferD : public IGraphicsBufferD
 {
    friend class GLDataFactory;
    friend class GLDataFactoryImpl;
    friend struct GLCommandQueue;
    GLuint m_bufs[3];
    GLenum m_target;
@@ -109,7 +152,7 @@ IGraphicsBufferS*
 GLDataFactory::Context::newStaticBuffer(BufferUse use, const void* data, size_t stride, size_t count)
 {
    GLGraphicsBufferS* retval = new GLGraphicsBufferS(use, data, stride * count);
-    m_deferredData->m_SBufs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_SBufs.emplace_back(retval);
    return retval;
 }
@@ -344,7 +387,7 @@ GLDataFactory::Context::newStaticTexture(size_t width, size_t height, size_t mip
                                         const void* data, size_t sz)
 {
    GLTextureS* retval = new GLTextureS(width, height, mips, fmt, data, sz);
-    m_deferredData->m_STexs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_STexs.emplace_back(retval);
    return retval;
 }
@@ -353,7 +396,7 @@ GLDataFactory::Context::newStaticArrayTexture(size_t width, size_t height, size_
                                              TextureFormat fmt, const void *data, size_t sz)
 {
    GLTextureSA* retval = new GLTextureSA(width, height, layers, mips, fmt, data, sz);
-    m_deferredData->m_SATexs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_SATexs.emplace_back(retval);
    return retval;
 }
@@ -362,8 +405,8 @@ class GLShaderPipeline : public IShaderPipeline
    friend class GLDataFactory;
    friend struct GLCommandQueue;
    friend struct GLShaderDataBinding;
-    GLuint m_vert = 0;
+    GLShareableShader::Token m_vert;
-    GLuint m_frag = 0;
+    GLShareableShader::Token m_frag;
    GLuint m_prog = 0;
    GLenum m_sfactor = GL_ONE;
    GLenum m_dfactor = GL_ZERO;
@@ -372,48 +415,17 @@ class GLShaderPipeline : public IShaderPipeline
    bool m_depthWrite = true;
    bool m_backfaceCulling = true;
    std::vector<GLint> m_uniLocs;
    bool initObjects()
    {
        m_vert = glCreateShader(GL_VERTEX_SHADER);
        m_frag = glCreateShader(GL_FRAGMENT_SHADER);
        m_prog = glCreateProgram();
        if (!m_vert || !m_frag || !m_prog)
        {
            glDeleteShader(m_vert);
            m_vert = 0;
            glDeleteShader(m_frag);
            m_frag = 0;
            glDeleteProgram(m_prog);
            m_prog = 0;
            return false;
        }
        glAttachShader(m_prog, m_vert);
        glAttachShader(m_prog, m_frag);
        return true;
    }
    void clearObjects()
    {
        if (m_vert)
            glDeleteShader(m_vert);
        if (m_frag)
            glDeleteShader(m_frag);
        if (m_prog)
            glDeleteProgram(m_prog);
    }
    GLShaderPipeline() = default;
 public:
    operator bool() const {return m_prog != 0;}
-    ~GLShaderPipeline() {clearObjects();}
+    ~GLShaderPipeline() { glDeleteProgram(m_prog); }
    GLShaderPipeline& operator=(const GLShaderPipeline&) = delete;
    GLShaderPipeline(const GLShaderPipeline&) = delete;
    GLShaderPipeline& operator=(GLShaderPipeline&& other)
    {
-        m_vert = other.m_vert;
+        m_vert = std::move(other.m_vert);
-        other.m_vert = 0;
+        m_frag = std::move(other.m_frag);
-        m_frag = other.m_frag;
+        m_prog = std::move(other.m_prog);
        other.m_frag = 0;
        m_prog = other.m_prog;
        other.m_prog = 0;
        m_sfactor = other.m_sfactor;
        m_dfactor = other.m_dfactor;
        m_depthTest = other.m_depthTest;
@@ -482,47 +494,95 @@ IShaderPipeline* GLDataFactory::Context::newShaderPipeline
 BlendFactor srcFac, BlendFactor dstFac, Primitive prim,
 bool depthTest, bool depthWrite, bool backfaceCulling)
 {
    GLDataFactoryImpl& factory = static_cast<GLDataFactoryImpl&>(m_parent);
    GLShaderPipeline shader;
    if (!shader.initObjects())
    {
        Log.report(logvisor::Error, "unable to create shader objects\n");
        return nullptr;
    }
    shader.m_sfactor = BLEND_FACTOR_TABLE[int(srcFac)];
    shader.m_dfactor = BLEND_FACTOR_TABLE[int(dstFac)];
    shader.m_depthTest = depthTest;
    shader.m_depthWrite = depthWrite;
    shader.m_backfaceCulling = backfaceCulling;
    shader.m_drawPrim = PRIMITIVE_TABLE[int(prim)];
-    glShaderSource(shader.m_vert, 1, &vertSource, nullptr);
+    XXH64_state_t hashState;
-    glCompileShader(shader.m_vert);
+    uint64_t hashes[2];
    XXH64_reset(&hashState, 0);
    XXH64_update(&hashState, vertSource, strlen(vertSource));
    hashes[0] = XXH64_digest(&hashState);
    XXH64_reset(&hashState, 0);
    XXH64_update(&hashState, fragSource, strlen(fragSource));
    hashes[1] = XXH64_digest(&hashState);
    GLint status;
-    glGetShaderiv(shader.m_vert, GL_COMPILE_STATUS, &status);
+    auto vertFind = factory.m_sharedShaders.find(hashes[0]);
-    if (status != GL_TRUE)
+    if (vertFind != factory.m_sharedShaders.end())
    {
-        GLint logLen;
+        shader.m_vert = vertFind->second->lock();
-        glGetShaderiv(shader.m_vert, GL_INFO_LOG_LENGTH, &logLen);
+    }
-        char* log = (char*)malloc(logLen);
+    else
-        glGetShaderInfoLog(shader.m_vert, logLen, nullptr, log);
+    {
-        Log.report(logvisor::Error, "unable to compile vert source\n%s\n%s\n", log, vertSource);
+        GLuint sobj = glCreateShader(GL_VERTEX_SHADER);
-        free(log);
+        if (!sobj)
        {
            Log.report(logvisor::Error, "unable to create vert shader");
            return nullptr;
        }
        glShaderSource(sobj, 1, &vertSource, nullptr);
        glCompileShader(sobj);
        glGetShaderiv(sobj, GL_COMPILE_STATUS, &status);
        if (status != GL_TRUE)
        {
            GLint logLen;
            glGetShaderiv(sobj, GL_INFO_LOG_LENGTH, &logLen);
            char* log = (char*)malloc(logLen);
            glGetShaderInfoLog(sobj, logLen, nullptr, log);
            Log.report(logvisor::Error, "unable to compile vert source\n%s\n%s\n", log, vertSource);
            free(log);
            return nullptr;
        }
        auto it =
        factory.m_sharedShaders.emplace(std::make_pair(hashes[0],
            std::make_unique<GLShareableShader>(factory, hashes[0], sobj))).first;
        shader.m_vert = it->second->lock();
    }
    auto fragFind = factory.m_sharedShaders.find(hashes[1]);
    if (fragFind != factory.m_sharedShaders.end())
    {
        shader.m_frag = fragFind->second->lock();
    }
    else
    {
        GLuint sobj = glCreateShader(GL_FRAGMENT_SHADER);
        if (!sobj)
        {
            Log.report(logvisor::Error, "unable to create frag shader");
            return nullptr;
        }
        glShaderSource(sobj, 1, &fragSource, nullptr);
        glCompileShader(sobj);
        glGetShaderiv(sobj, GL_COMPILE_STATUS, &status);
        if (status != GL_TRUE)
        {
            GLint logLen;
            glGetShaderiv(sobj, GL_INFO_LOG_LENGTH, &logLen);
            char* log = (char*)malloc(logLen);
            glGetShaderInfoLog(sobj, logLen, nullptr, log);
            Log.report(logvisor::Error, "unable to compile frag source\n%s\n%s\n", log, fragSource);
            free(log);
            return nullptr;
        }
        auto it =
        factory.m_sharedShaders.emplace(std::make_pair(hashes[1],
            std::make_unique<GLShareableShader>(factory, hashes[1], sobj))).first;
        shader.m_frag = it->second->lock();
    }
    shader.m_prog = glCreateProgram();
    if (!shader.m_prog)
    {
        Log.report(logvisor::Error, "unable to create shader program");
        return nullptr;
    }
-    glShaderSource(shader.m_frag, 1, &fragSource, nullptr);
+    glAttachShader(shader.m_prog, shader.m_vert.get().m_shader);
-    glCompileShader(shader.m_frag);
+    glAttachShader(shader.m_prog, shader.m_frag.get().m_shader);
    glGetShaderiv(shader.m_frag, GL_COMPILE_STATUS, &status);
    if (status != GL_TRUE)
    {
        GLint logLen;
        glGetShaderiv(shader.m_frag, GL_INFO_LOG_LENGTH, &logLen);
        char* log = (char*)malloc(logLen);
        glGetShaderInfoLog(shader.m_frag, logLen, nullptr, log);
        Log.report(logvisor::Error, "unable to compile frag source\n%s\n%s\n", log, fragSource);
        free(log);
        return nullptr;
    }
    glLinkProgram(shader.m_prog);
    glGetProgramiv(shader.m_prog, GL_LINK_STATUS, &status);
@@ -563,8 +623,15 @@ IShaderPipeline* GLDataFactory::Context::newShaderPipeline
        }
    }
    shader.m_sfactor = BLEND_FACTOR_TABLE[int(srcFac)];
    shader.m_dfactor = BLEND_FACTOR_TABLE[int(dstFac)];
    shader.m_depthTest = depthTest;
    shader.m_depthWrite = depthWrite;
    shader.m_backfaceCulling = backfaceCulling;
    shader.m_drawPrim = PRIMITIVE_TABLE[int(prim)];
    GLShaderPipeline* retval = new GLShaderPipeline(std::move(shader));
-    m_deferredData->m_SPs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_SPs.emplace_back(retval);
    return retval;
 }
@@ -699,17 +766,17 @@ GLDataFactory::Context::newShaderDataBinding(IShaderPipeline* pipeline,
                                             size_t texCount, ITexture** texs, size_t baseVert, size_t baseInst)
 {
    GLShaderDataBinding* retval =
-    new GLShaderDataBinding(m_deferredData.get(), pipeline, vtxFormat, ubufCount, ubufs,
+    new GLShaderDataBinding(GLDataFactoryImpl::m_deferredData.get(), pipeline, vtxFormat, ubufCount, ubufs,
                            ubufOffs, ubufSizes, texCount, texs);
-    m_deferredData->m_SBinds.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_SBinds.emplace_back(retval);
    return retval;
 }
-GLDataFactory::GLDataFactory(IGraphicsContext* parent, uint32_t drawSamples)
+GLDataFactoryImpl::GLDataFactoryImpl(IGraphicsContext* parent, uint32_t drawSamples)
 : m_parent(parent), m_drawSamples(drawSamples) {}
-GraphicsDataToken GLDataFactory::commitTransaction(const FactoryCommitFunc& trans)
+GraphicsDataToken GLDataFactoryImpl::commitTransaction(const FactoryCommitFunc& trans)
 {
    if (m_deferredData.get())
        Log.report(logvisor::Fatal, "nested commitTransaction usage detected");
@@ -736,7 +803,7 @@ GraphicsDataToken GLDataFactory::commitTransaction(const FactoryCommitFunc& tran
    return GraphicsDataToken(this, retval);
 }
-GraphicsBufferPoolToken GLDataFactory::newBufferPool()
+GraphicsBufferPoolToken GLDataFactoryImpl::newBufferPool()
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    GLPool* retval = new GLPool;
@@ -744,7 +811,7 @@ GraphicsBufferPoolToken GLDataFactory::newBufferPool()
    return GraphicsBufferPoolToken(this, retval);
 }
-void GLDataFactory::destroyData(IGraphicsData* d)
+void GLDataFactoryImpl::destroyData(IGraphicsData* d)
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    GLData* data = static_cast<GLData*>(d);
@@ -752,7 +819,7 @@ void GLDataFactory::destroyData(IGraphicsData* d)
    data->decrement();
 }
-void GLDataFactory::destroyAllData()
+void GLDataFactoryImpl::destroyAllData()
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    for (GLData* data : m_committedData)
@@ -763,7 +830,7 @@ void GLDataFactory::destroyAllData()
    m_committedPools.clear();
 }
-void GLDataFactory::destroyPool(IGraphicsBufferPool* p)
+void GLDataFactoryImpl::destroyPool(IGraphicsBufferPool* p)
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    GLPool* pool = static_cast<GLPool*>(p);
@@ -771,8 +838,8 @@ void GLDataFactory::destroyPool(IGraphicsBufferPool* p)
    delete pool;
 }
-IGraphicsBufferD* GLDataFactory::newPoolBuffer(IGraphicsBufferPool* p, BufferUse use,
+IGraphicsBufferD* GLDataFactoryImpl::newPoolBuffer(IGraphicsBufferPool* p, BufferUse use,
-                                               size_t stride, size_t count)
+                                                   size_t stride, size_t count)
 {
    GLPool* pool = static_cast<GLPool*>(p);
    GLGraphicsBufferD* retval = new GLGraphicsBufferD(use, stride * count);
@@ -780,7 +847,7 @@ IGraphicsBufferD* GLDataFactory::newPoolBuffer(IGraphicsBufferPool* p, BufferUse
    return retval;
 }
-void GLDataFactory::deletePoolBuffer(IGraphicsBufferPool *p, IGraphicsBufferD *buf)
+void GLDataFactoryImpl::deletePoolBuffer(IGraphicsBufferPool *p, IGraphicsBufferD *buf)
 {
    GLPool* pool = static_cast<GLPool*>(p);
    pool->m_DBufs.erase(static_cast<GLGraphicsBufferD*>(buf));
@@ -1336,7 +1403,7 @@ struct GLCommandQueue : IGraphicsCommandQueue
        }
        /* Update dynamic data here */
-        GLDataFactory* gfxF = static_cast<GLDataFactory*>(m_parent->getDataFactory());
+        GLDataFactoryImpl* gfxF = static_cast<GLDataFactoryImpl*>(m_parent->getDataFactory());
        std::unique_lock<std::mutex> datalk(gfxF->m_committedMutex);
        for (GLData* d : gfxF->m_committedData)
        {
@@ -1403,7 +1470,7 @@ IGraphicsBufferD*
 GLDataFactory::Context::newDynamicBuffer(BufferUse use, size_t stride, size_t count)
 {
    GLGraphicsBufferD* retval = new GLGraphicsBufferD(use, stride * count);
-    m_deferredData->m_DBufs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_DBufs.emplace_back(retval);
    return retval;
 }
@@ -1478,7 +1545,7 @@ ITextureD*
 GLDataFactory::Context::newDynamicTexture(size_t width, size_t height, TextureFormat fmt)
 {
    GLTextureD* retval = new GLTextureD(width, height, fmt);
-    m_deferredData->m_DTexs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_DTexs.emplace_back(retval);
    return retval;
 }
@@ -1546,11 +1613,12 @@ ITextureR*
 GLDataFactory::Context::newRenderTexture(size_t width, size_t height,
                                         bool enableShaderColorBinding, bool enableShaderDepthBinding)
 {
-    GLCommandQueue* q = static_cast<GLCommandQueue*>(m_parent.m_parent->getCommandQueue());
+    GLDataFactoryImpl& factory = static_cast<GLDataFactoryImpl&>(m_parent);
-    GLTextureR* retval = new GLTextureR(q, width, height, m_parent.m_drawSamples,
+    GLCommandQueue* q = static_cast<GLCommandQueue*>(factory.m_parent->getCommandQueue());
    GLTextureR* retval = new GLTextureR(q, width, height, factory.m_drawSamples,
                                        enableShaderColorBinding, enableShaderDepthBinding);
    q->resizeRenderTexture(retval, width, height);
-    m_deferredData->m_RTexs.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_RTexs.emplace_back(retval);
    return retval;
 }
@@ -1572,9 +1640,10 @@ IVertexFormat* GLDataFactory::Context::newVertexFormat
 (size_t elementCount, const VertexElementDescriptor* elements,
 size_t baseVert, size_t baseInst)
 {
-    GLCommandQueue* q = static_cast<GLCommandQueue*>(m_parent.m_parent->getCommandQueue());
+    GLDataFactoryImpl& factory = static_cast<GLDataFactoryImpl&>(m_parent);
    GLCommandQueue* q = static_cast<GLCommandQueue*>(factory.m_parent->getCommandQueue());
    GLVertexFormat* retval = new struct GLVertexFormat(q, elementCount, elements, baseVert, baseInst);
-    m_deferredData->m_VFmts.emplace_back(retval);
+    GLDataFactoryImpl::m_deferredData->m_VFmts.emplace_back(retval);
    return retval;
 }
@@ -1583,4 +1652,9 @@ IGraphicsCommandQueue* _NewGLCommandQueue(IGraphicsContext* parent)
    return new struct GLCommandQueue(parent);
 }
 IGraphicsDataFactory* _NewGLDataFactory(IGraphicsContext* parent, uint32_t drawSamples)
 {
    return new class GLDataFactoryImpl(parent, drawSamples);
 }
 }
--- a/lib/graphicsdev/Metal.mm
+++ b/lib/graphicsdev/Metal.mm
@@ -5,6 +5,9 @@
 #include "boo/IGraphicsContext.hpp"
 #include "Common.hpp"
 #include <vector>
 #include <unordered_map>
 #include <unordered_set>
 #include "xxhash.h"
 #if !__has_feature(objc_arc)
 #error ARC Required
@@ -17,8 +20,48 @@ namespace boo
 {
 static logvisor::Module Log("boo::Metal");
 struct MetalCommandQueue;
 class MetalDataFactoryImpl;
-ThreadLocalPtr<struct MetalData> MetalDataFactory::m_deferredData;
+struct MetalShareableShader : IShareableShader<MetalDataFactoryImpl, MetalShareableShader>
 {
    id<MTLFunction> m_shader;
    MetalShareableShader(MetalDataFactoryImpl& fac, uint64_t key, id<MTLFunction> s)
    : IShareableShader(fac, key), m_shader(s) {}
 };
 /** Concrete Metal data factory, private to this translation unit.
  *  Derives from the public MetalDataFactory and holds the state the implementation
  *  uses, including the table of shared shader functions. */
 class MetalDataFactoryImpl : public MetalDataFactory
 {
    friend struct MetalCommandQueue;
    friend class MetalDataFactory::Context;
    IGraphicsContext* m_parent;
    /* Thread-local MetalData that Context::new* calls append resources to. */
    static ThreadLocalPtr<struct MetalData> m_deferredData;
    std::unordered_set<struct MetalData*> m_committedData;
    std::unordered_set<struct MetalPool*> m_committedPools;
    /* Locked by MetalCommandQueue while it walks m_committedData. */
    std::mutex m_committedMutex;
    /* Shared MTLFunction wrappers keyed by a 64-bit value.
     * NOTE(review): presumably an XXH64 hash of the shader source, as in the GL
     * backend -- the code that fills this map is outside this view. */
    std::unordered_map<uint64_t, std::unique_ptr<MetalShareableShader>> m_sharedShaders;
    /* Passed to the resource constructors by the Context::new* methods. */
    struct MetalContext* m_ctx;
    uint32_t m_sampleCount;
    void destroyData(IGraphicsData*);
    void destroyAllData();
    void destroyPool(IGraphicsBufferPool*);
    IGraphicsBufferD* newPoolBuffer(IGraphicsBufferPool* pool, BufferUse use,
                                    size_t stride, size_t count);
    void deletePoolBuffer(IGraphicsBufferPool* p, IGraphicsBufferD* buf);
 public:
    MetalDataFactoryImpl(IGraphicsContext* parent, MetalContext* ctx, uint32_t sampleCount);
    /* Empty: unlike the GL factory, this destructor does not call destroyAllData(). */
    ~MetalDataFactoryImpl() {}
    Platform platform() const {return Platform::Metal;}
    const char* platformName() const {return "Metal";}
    GraphicsDataToken commitTransaction(const std::function<bool(IGraphicsDataFactory::Context& ctx)>&);
    GraphicsBufferPoolToken newBufferPool();
    /* Called by IShareableShader::decrement() when the last Token is released;
     * erasing the entry destroys the MetalShareableShader.
     * NOTE(review): this erase is not guarded by m_committedMutex -- confirm that
     * tokens are only created and released on one thread. */
    void _unregisterShareableShader(uint64_t key) { m_sharedShaders.erase(key); }
 };
 ThreadLocalPtr<struct MetalData> MetalDataFactoryImpl::m_deferredData;
 struct MetalData : IGraphicsDataPriv<MetalData>
 {
    std::vector<std::unique_ptr<class MetalShaderPipeline>> m_SPs;
@@ -60,6 +103,7 @@ public:
 class MetalGraphicsBufferD : public IGraphicsBufferD
 {
    friend class MetalDataFactory;
    friend class MetalDataFactoryImpl;
    friend struct MetalCommandQueue;
    MetalCommandQueue* m_q;
    std::unique_ptr<uint8_t[]> m_cpuBuf;
@@ -502,19 +546,24 @@ class MetalShaderPipeline : public IShaderPipeline
    MTLCullMode m_cullMode = MTLCullModeNone;
    MTLPrimitiveType m_drawPrim;
    const MetalVertexFormat* m_vtxFmt;
    MetalShareableShader::Token m_vert;
    MetalShareableShader::Token m_frag;
-    MetalShaderPipeline(MetalContext* ctx, id<MTLFunction> vert, id<MTLFunction> frag,
+    MetalShaderPipeline(MetalContext* ctx,
                        MetalShareableShader::Token&& vert,
                        MetalShareableShader::Token&& frag,
                        const MetalVertexFormat* vtxFmt, NSUInteger targetSamples,
                        BlendFactor srcFac, BlendFactor dstFac, Primitive prim,
                        bool depthTest, bool depthWrite, bool backfaceCulling)
-    : m_drawPrim(PRIMITIVE_TABLE[int(prim)]), m_vtxFmt(vtxFmt)
+    : m_drawPrim(PRIMITIVE_TABLE[int(prim)]), m_vtxFmt(vtxFmt),
      m_vert(std::move(vert)), m_frag(std::move(frag))
    {
        if (backfaceCulling)
            m_cullMode = MTLCullModeBack;
        MTLRenderPipelineDescriptor* desc = [MTLRenderPipelineDescriptor new];
-        desc.vertexFunction = vert;
+        desc.vertexFunction = m_vert.get().m_shader;
-        desc.fragmentFunction = frag;
+        desc.fragmentFunction = m_frag.get().m_shader;
        desc.vertexDescriptor = vtxFmt->m_vdesc;
        desc.sampleCount = targetSamples;
        desc.colorAttachments[0].pixelFormat = MTLPixelFormatBGRA8Unorm;
@@ -897,7 +946,7 @@ struct MetalCommandQueue : IGraphicsCommandQueue
            return;
        /* Update dynamic data here */
-        MetalDataFactory* gfxF = static_cast<MetalDataFactory*>(m_parent->getDataFactory());
+        MetalDataFactoryImpl* gfxF = static_cast<MetalDataFactoryImpl*>(m_parent->getDataFactory());
        std::unique_lock<std::mutex> datalk(gfxF->m_committedMutex);
        for (MetalData* d : gfxF->m_committedData)
        {
@@ -1042,49 +1091,55 @@ void MetalTextureD::unmap()
    m_validSlots = 0;
 }
-MetalDataFactory::MetalDataFactory(IGraphicsContext* parent, MetalContext* ctx, uint32_t sampleCount)
+MetalDataFactoryImpl::MetalDataFactoryImpl(IGraphicsContext* parent, MetalContext* ctx, uint32_t sampleCount)
 : m_parent(parent), m_ctx(ctx), m_sampleCount(sampleCount) {}
 /* Builds a MetalGraphicsBufferS from the caller's data using the Metal context held by the */
 /* owning factory (m_parent viewed as MetalDataFactoryImpl), appends it to the m_SBufs list */
 /* of MetalDataFactoryImpl::m_deferredData and returns the new buffer. */
 IGraphicsBufferS* MetalDataFactory::Context::newStaticBuffer(BufferUse use, const void* data, size_t stride, size_t count)
 {
-    MetalGraphicsBufferS* retval = new MetalGraphicsBufferS(use, m_parent.m_ctx, data, stride, count);
+    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
-    m_deferredData->m_SBufs.emplace_back(retval);
+    MetalGraphicsBufferS* retval = new MetalGraphicsBufferS(use, factory.m_ctx, data, stride, count);
    MetalDataFactoryImpl::m_deferredData->m_SBufs.emplace_back(retval);
    return retval;
 }
 /* Builds a MetalGraphicsBufferD bound to the command queue obtained from the factory's */
 /* graphics context, appends it to the m_DBufs list of MetalDataFactoryImpl::m_deferredData */
 /* and returns the new buffer. */
 IGraphicsBufferD* MetalDataFactory::Context::newDynamicBuffer(BufferUse use, size_t stride, size_t count)
 {
-    MetalCommandQueue* q = static_cast<MetalCommandQueue*>(m_parent.m_parent->getCommandQueue());
+    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
-    MetalGraphicsBufferD* retval = new MetalGraphicsBufferD(q, use, m_parent.m_ctx, stride, count);
+    MetalCommandQueue* q = static_cast<MetalCommandQueue*>(factory.m_parent->getCommandQueue());
-    m_deferredData->m_DBufs.emplace_back(retval);
+    MetalGraphicsBufferD* retval = new MetalGraphicsBufferD(q, use, factory.m_ctx, stride, count);
    MetalDataFactoryImpl::m_deferredData->m_DBufs.emplace_back(retval);
    return retval;
 }
 /* Builds a MetalTextureS from the supplied pixel data using the owning factory's Metal */
 /* context, appends it to the m_STexs list of MetalDataFactoryImpl::m_deferredData and */
 /* returns the new texture. */
 ITextureS* MetalDataFactory::Context::newStaticTexture(size_t width, size_t height, size_t mips, TextureFormat fmt,
                                                       const void* data, size_t sz)
 {
-    MetalTextureS* retval = new MetalTextureS(m_parent.m_ctx, width, height, mips, fmt, data, sz);
+    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
-    m_deferredData->m_STexs.emplace_back(retval);
+    MetalTextureS* retval = new MetalTextureS(factory.m_ctx, width, height, mips, fmt, data, sz);
    MetalDataFactoryImpl::m_deferredData->m_STexs.emplace_back(retval);
    return retval;
 }
 /* Builds a MetalTextureSA (layered texture) from the supplied pixel data using the owning */
 /* factory's Metal context, appends it to the m_SATexs list of */
 /* MetalDataFactoryImpl::m_deferredData and returns the new texture. */
 ITextureSA* MetalDataFactory::Context::newStaticArrayTexture(size_t width, size_t height, size_t layers, size_t mips,
                                                             TextureFormat fmt, const void* data, size_t sz)
 {
-    MetalTextureSA* retval = new MetalTextureSA(m_parent.m_ctx, width, height, layers, mips, fmt, data, sz);
+    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
-    m_deferredData->m_SATexs.emplace_back(retval);
+    MetalTextureSA* retval = new MetalTextureSA(factory.m_ctx, width, height, layers, mips, fmt, data, sz);
    MetalDataFactoryImpl::m_deferredData->m_SATexs.emplace_back(retval);
    return retval;
 }
 /* Builds a MetalTextureD bound to the command queue obtained from the factory's graphics */
 /* context, appends it to the m_DTexs list of MetalDataFactoryImpl::m_deferredData and */
 /* returns the new texture. */
 ITextureD* MetalDataFactory::Context::newDynamicTexture(size_t width, size_t height, TextureFormat fmt)
 {
-    MetalCommandQueue* q = static_cast<MetalCommandQueue*>(m_parent.m_parent->getCommandQueue());
+    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
-    MetalTextureD* retval = new MetalTextureD(q, m_parent.m_ctx, width, height, fmt);
+    MetalCommandQueue* q = static_cast<MetalCommandQueue*>(factory.m_parent->getCommandQueue());
-    m_deferredData->m_DTexs.emplace_back(retval);
+    MetalTextureD* retval = new MetalTextureD(q, factory.m_ctx, width, height, fmt);
    MetalDataFactoryImpl::m_deferredData->m_DTexs.emplace_back(retval);
    return retval;
 }
 /* Builds a MetalTextureR using the owning factory's Metal context and sample count, appends */
 /* it to the m_RTexs list of MetalDataFactoryImpl::m_deferredData and returns it. */
 /* NOTE(review): enableShaderDepthBinding is accepted but not forwarded to MetalTextureR */
 /* here; only enableShaderColorBinding is passed on -- confirm this is intended. */
 ITextureR* MetalDataFactory::Context::newRenderTexture(size_t width, size_t height,
                                                       bool enableShaderColorBinding, bool enableShaderDepthBinding)
 {
-    MetalTextureR* retval = new MetalTextureR(m_parent.m_ctx, width, height, m_parent.m_sampleCount, enableShaderColorBinding);
+    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
-    m_deferredData->m_RTexs.emplace_back(retval);
+    MetalTextureR* retval = new MetalTextureR(factory.m_ctx, width, height, factory.m_sampleCount, enableShaderColorBinding);
    MetalDataFactoryImpl::m_deferredData->m_RTexs.emplace_back(retval);
    return retval;
 }
@@ -1092,7 +1147,7 @@ IVertexFormat* MetalDataFactory::Context::newVertexFormat(size_t elementCount, c
                                                          size_t baseVert, size_t baseInst)
 {
    MetalVertexFormat* retval = new struct MetalVertexFormat(elementCount, elements);
-    m_deferredData->m_VFmts.emplace_back(retval);
+    MetalDataFactoryImpl::m_deferredData->m_VFmts.emplace_back(retval);
    return retval;
 }
@@ -1101,34 +1156,71 @@ IShaderPipeline* MetalDataFactory::Context::newShaderPipeline(const char* vertSo
                                                              BlendFactor srcFac, BlendFactor dstFac, Primitive prim,
                                                              bool depthTest, bool depthWrite, bool backfaceCulling)
 {
    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
    MTLCompileOptions* compOpts = [MTLCompileOptions new];
    compOpts.languageVersion = MTLLanguageVersion1_1;
    NSError* err = nullptr;
-    id<MTLLibrary> vertShaderLib = [m_parent.m_ctx->m_dev newLibraryWithSource:@(vertSource)
+    XXH64_state_t hashState;
-                                                                       options:compOpts
+    uint64_t hashes[2];
-                                                                         error:&err];
+    XXH64_reset(&hashState, 0);
-    if (!vertShaderLib)
+    XXH64_update(&hashState, vertSource, strlen(vertSource));
-    {
+    hashes[0] = XXH64_digest(&hashState);
-        printf("%s\n", vertSource);
+    XXH64_reset(&hashState, 0);
-        Log.report(logvisor::Fatal, "error compiling vert shader: %s", [[err localizedDescription] UTF8String]);
+    XXH64_update(&hashState, fragSource, strlen(fragSource));
-    }
+    hashes[1] = XXH64_digest(&hashState);
    id<MTLFunction> vertFunc = [vertShaderLib newFunctionWithName:@"vmain"];
-    id<MTLLibrary> fragShaderLib = [m_parent.m_ctx->m_dev newLibraryWithSource:@(fragSource)
+    MetalShareableShader::Token vertShader;
-                                                                       options:compOpts
+    MetalShareableShader::Token fragShader;
-                                                                         error:&err];
+    auto vertFind = factory.m_sharedShaders.find(hashes[0]);
-    if (!fragShaderLib)
+    if (vertFind != factory.m_sharedShaders.end())
    {
-        printf("%s\n", fragSource);
+        vertShader = vertFind->second->lock();
        Log.report(logvisor::Fatal, "error compiling frag shader: %s", [[err localizedDescription] UTF8String]);
    }
-    id<MTLFunction> fragFunc = [fragShaderLib newFunctionWithName:@"fmain"];
+    else
    {
        id<MTLLibrary> vertShaderLib = [factory.m_ctx->m_dev newLibraryWithSource:@(vertSource)
                                                                          options:compOpts
                                                                            error:&err];
        if (!vertShaderLib)
        {
            printf("%s\n", vertSource);
            Log.report(logvisor::Fatal, "error compiling vert shader: %s", [[err localizedDescription] UTF8String]);
        }
        id<MTLFunction> vertFunc = [vertShaderLib newFunctionWithName:@"vmain"];
-    MetalShaderPipeline* retval = new MetalShaderPipeline(m_parent.m_ctx, vertFunc, fragFunc,
+        auto it =
        factory.m_sharedShaders.emplace(std::make_pair(hashes[0],
            std::make_unique<MetalShareableShader>(factory, hashes[0], vertFunc))).first;
        vertShader = it->second->lock();
    }
    auto fragFind = factory.m_sharedShaders.find(hashes[1]);
    if (fragFind != factory.m_sharedShaders.end())
    {
        fragShader = fragFind->second->lock();
    }
    else
    {
        id<MTLLibrary> fragShaderLib = [factory.m_ctx->m_dev newLibraryWithSource:@(fragSource)
                                                                          options:compOpts
                                                                            error:&err];
        if (!fragShaderLib)
        {
            printf("%s\n", fragSource);
            Log.report(logvisor::Fatal, "error compiling frag shader: %s", [[err localizedDescription] UTF8String]);
        }
        id<MTLFunction> fragFunc = [fragShaderLib newFunctionWithName:@"fmain"];
        auto it =
        factory.m_sharedShaders.emplace(std::make_pair(hashes[1],
            std::make_unique<MetalShareableShader>(factory, hashes[1], fragFunc))).first;
        fragShader = it->second->lock();
    }
    MetalShaderPipeline* retval = new MetalShaderPipeline(factory.m_ctx, std::move(vertShader), std::move(fragShader),
                                                          static_cast<const MetalVertexFormat*>(vtxFmt), targetSamples,
                                                          srcFac, dstFac, prim, depthTest, depthWrite, backfaceCulling);
-    m_deferredData->m_SPs.emplace_back(retval);
+    MetalDataFactoryImpl::m_deferredData->m_SPs.emplace_back(retval);
    return retval;
 }
@@ -1140,16 +1232,17 @@ MetalDataFactory::Context::newShaderDataBinding(IShaderPipeline* pipeline,
                                                const size_t* ubufOffs, const size_t* ubufSizes,
                                                size_t texCount, ITexture** texs, size_t baseVert, size_t baseInst)
 {
    MetalDataFactoryImpl& factory = static_cast<MetalDataFactoryImpl&>(m_parent);
    MetalShaderDataBinding* retval =
-    new MetalShaderDataBinding(m_deferredData.get(),
+    new MetalShaderDataBinding(MetalDataFactoryImpl::m_deferredData.get(),
-                               m_parent.m_ctx, pipeline, vbuf, instVbo, ibuf,
+                               factory.m_ctx, pipeline, vbuf, instVbo, ibuf,
                               ubufCount, ubufs, ubufStages, ubufOffs,
                               ubufSizes, texCount, texs, baseVert, baseInst);
-    m_deferredData->m_SBinds.emplace_back(retval);
+    MetalDataFactoryImpl::m_deferredData->m_SBinds.emplace_back(retval);
    return retval;
 }
-GraphicsDataToken MetalDataFactory::commitTransaction(const FactoryCommitFunc& trans)
+GraphicsDataToken MetalDataFactoryImpl::commitTransaction(const FactoryCommitFunc& trans)
 {
    if (m_deferredData.get())
        Log.report(logvisor::Fatal, "nested commitTransaction usage detected");
@@ -1170,7 +1263,7 @@ GraphicsDataToken MetalDataFactory::commitTransaction(const FactoryCommitFunc& t
    return GraphicsDataToken(this, retval);
 }
-GraphicsBufferPoolToken MetalDataFactory::newBufferPool()
+GraphicsBufferPoolToken MetalDataFactoryImpl::newBufferPool()
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    MetalPool* retval = new MetalPool;
@@ -1178,7 +1271,7 @@ GraphicsBufferPoolToken MetalDataFactory::newBufferPool()
    return GraphicsBufferPoolToken(this, retval);
 }
-void MetalDataFactory::destroyData(IGraphicsData* d)
+void MetalDataFactoryImpl::destroyData(IGraphicsData* d)
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    MetalData* data = static_cast<MetalData*>(d);
@@ -1186,7 +1279,7 @@ void MetalDataFactory::destroyData(IGraphicsData* d)
    data->decrement();
 }
-void MetalDataFactory::destroyAllData()
+void MetalDataFactoryImpl::destroyAllData()
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    for (MetalData* data : m_committedData)
@@ -1197,7 +1290,7 @@ void MetalDataFactory::destroyAllData()
    m_committedPools.clear();
 }
-void MetalDataFactory::destroyPool(IGraphicsBufferPool* p)
+void MetalDataFactoryImpl::destroyPool(IGraphicsBufferPool* p)
 {
    std::unique_lock<std::mutex> lk(m_committedMutex);
    MetalPool* pool = static_cast<MetalPool*>(p);
@@ -1205,8 +1298,8 @@ void MetalDataFactory::destroyPool(IGraphicsBufferPool* p)
    delete pool;
 }
-IGraphicsBufferD* MetalDataFactory::newPoolBuffer(IGraphicsBufferPool* p, BufferUse use,
+IGraphicsBufferD* MetalDataFactoryImpl::newPoolBuffer(IGraphicsBufferPool* p, BufferUse use,
-                                                  size_t stride, size_t count)
+                                                      size_t stride, size_t count)
 {
    MetalPool* pool = static_cast<MetalPool*>(p);
    MetalCommandQueue* q = static_cast<MetalCommandQueue*>(m_parent->getCommandQueue());
@@ -1215,7 +1308,7 @@ IGraphicsBufferD* MetalDataFactory::newPoolBuffer(IGraphicsBufferPool* p, Buffer
    return retval;
 }
-void MetalDataFactory::deletePoolBuffer(IGraphicsBufferPool* p, IGraphicsBufferD* buf)
+void MetalDataFactoryImpl::deletePoolBuffer(IGraphicsBufferPool* p, IGraphicsBufferD* buf)
 {
    MetalPool* pool = static_cast<MetalPool*>(p);
    pool->m_DBufs.erase(static_cast<MetalGraphicsBufferD*>(buf));
@@ -1227,6 +1320,11 @@ IGraphicsCommandQueue* _NewMetalCommandQueue(MetalContext* ctx, IWindow* parentW
    return new struct MetalCommandQueue(ctx, parentWindow, parent);
 }
 /* Creation hook for the Metal data factory: heap-allocates a MetalDataFactoryImpl and hands */
 /* it back through the IGraphicsDataFactory interface. The caller receives the raw pointer. */
 IGraphicsDataFactory* _NewMetalDataFactory(IGraphicsContext* parent, MetalContext* ctx, uint32_t sampleCount)
 {
    MetalDataFactoryImpl* const factory = new class MetalDataFactoryImpl(parent, ctx, sampleCount);
    return factory;
 }
 }
 #endif
--- a/lib/mac/WindowCocoa.mm
+++ b/lib/mac/WindowCocoa.mm
@@ -185,8 +185,11 @@ namespace boo
 {
 static logvisor::Module Log("boo::WindowCocoa");
 IGraphicsCommandQueue* _NewGLCommandQueue(IGraphicsContext* parent);
 IGraphicsDataFactory* _NewGLDataFactory(IGraphicsContext* parent, uint32_t drawSamples);
 IGraphicsCommandQueue* _NewMetalCommandQueue(MetalContext* ctx, IWindow* parentWindow,
                                             IGraphicsContext* parent);
 IGraphicsDataFactory* _NewMetalDataFactory(IGraphicsContext* parent,
                                           MetalContext* ctx, uint32_t sampleCount);
 void _CocoaUpdateLastGLCtx(NSOpenGLContext* lastGLCtx);
 class GraphicsContextCocoaGL : public GraphicsContextCocoa
@@ -206,7 +209,7 @@ public:
    : GraphicsContextCocoa(api, EPixelFormat::RGBA8, parentWindow),
      m_lastCtx(lastGLCtx)
    {
-        m_dataFactory = new GLDataFactory(this, sampleCount);
+        m_dataFactory = _NewGLDataFactory(this, sampleCount);
    }
    ~GraphicsContextCocoaGL()
@@ -362,7 +365,7 @@ public:
    : GraphicsContextCocoa(api, EPixelFormat::RGBA8, parentWindow),
      m_parentWindow(parentWindow), m_metalCtx(metalCtx)
    {
-        m_dataFactory = new MetalDataFactory(this, metalCtx, sampleCount);
+        m_dataFactory = _NewMetalDataFactory(this, metalCtx, sampleCount);
    }
    ~GraphicsContextCocoaMetal()
--- a/lib/win/WindowWin32.cpp
+++ b/lib/win/WindowWin32.cpp
@@ -36,6 +36,7 @@ IGraphicsDataFactory* _NewD3D12DataFactory(D3D12Context* ctx, IGraphicsContext*
 IGraphicsCommandQueue* _NewD3D11CommandQueue(D3D11Context* ctx, D3D11Context::Window* windowCtx, IGraphicsContext* parent);
 IGraphicsDataFactory* _NewD3D11DataFactory(D3D11Context* ctx, IGraphicsContext* parent, uint32_t sampleCount);
 IGraphicsCommandQueue* _NewGLCommandQueue(IGraphicsContext* parent);
 IGraphicsDataFactory* _NewGLDataFactory(IGraphicsContext* parent, uint32_t drawSamples);
 #if BOO_HAS_VULKAN
 IGraphicsCommandQueue* _NewVulkanCommandQueue(VulkanContext* ctx,
                                              VulkanContext::Window* windowCtx,
@@ -280,7 +281,7 @@ public:
                Log.report(logvisor::Fatal, "unable to share contexts");
        m_3dCtx.m_ctxOgl.m_lastContext = w.m_mainContext;
-        m_dataFactory = new GLDataFactory(this, sampleCount);
+        m_dataFactory = _NewGLDataFactory(this, sampleCount);
        m_commandQueue = _NewGLCommandQueue(this);
    }
--- a/lib/x11/WindowXlib.cpp
+++ b/lib/x11/WindowXlib.cpp
@@ -114,6 +114,7 @@ namespace boo
 {
 static logvisor::Module Log("boo::WindowXlib");
 IGraphicsCommandQueue* _NewGLCommandQueue(IGraphicsContext* parent);
 IGraphicsDataFactory* _NewGLDataFactory(IGraphicsContext* parent, uint32_t drawSamples);
 #if BOO_HAS_VULKAN
 IGraphicsCommandQueue* _NewVulkanCommandQueue(VulkanContext* ctx,
                                              VulkanContext::Window* windowCtx,
@@ -327,7 +328,7 @@ public:
    : GraphicsContextXlib(api, EPixelFormat::RGBA8, parentWindow, display, drawSamples),
      m_lastCtx(lastCtx)
    {
-        m_dataFactory = new class GLDataFactory(this, drawSamples);
+        m_dataFactory = _NewGLDataFactory(this, drawSamples);
        /* Query framebuffer configurations */
        GLXFBConfig* fbConfigs = nullptr;
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_executable(booTest WIN32 main.cpp)
-target_link_libraries(booTest boo logvisor ${BOO_SYS_LIBS})
+target_link_libraries(booTest boo logvisor xxhash ${BOO_SYS_LIBS})
--- a/xxhash/CMakeLists.txt
+++ b/xxhash/CMakeLists.txt
@@ -0,0 +1 @@
 add_library(xxhash xxhash.c xxhash.h)
--- a/xxhash/LICENSE
+++ b/xxhash/LICENSE
@@ -0,0 +1,24 @@
 xxHash Library
 Copyright (c) 2012-2014, Yann Collet
 All rights reserved.
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 * Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/xxhash/xxhash.c
+++ b/xxhash/xxhash.c
@@ -0,0 +1,962 @@
 /*
 xxHash - Fast Hash algorithm
 Copyright (C) 2012-2015, Yann Collet
 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 * Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 You can contact the author at :
 - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 /**************************************
 *  Tuning parameters
 **************************************/
 /* XXH_FORCE_MEMORY_ACCESS
 * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
 * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
 * The switch below allows selecting a different access method for improved performance.
 * Method 0 (default) : use `memcpy()`. Safe and portable.
 * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
 *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
 * Method 2 : direct access. This method is portable but violates the C standard.
 *            It can generate buggy code on targets which generate assembly depending on alignment.
 *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
 * See http://stackoverflow.com/a/32095106/646947 for details.
 * Prefer these methods in priority order (0 > 1 > 2)
 */
 #ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
 #  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
 #    define XXH_FORCE_MEMORY_ACCESS 2
 #  elif defined(__INTEL_COMPILER) || \
  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
 #    define XXH_FORCE_MEMORY_ACCESS 1
 #  endif
 #endif
 /* XXH_ACCEPT_NULL_INPUT_POINTER :
 * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
 * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
 * By default, this option is disabled. To enable it, uncomment below define :
 */
 /* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
 /* XXH_FORCE_NATIVE_FORMAT :
 * By default, the xxHash library provides endian-independent hash values, based on little-endian convention.
 * Results are therefore identical for little-endian and big-endian CPU.
 * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
 * Should endian-independence be of no importance for your application, you may set the #define below to 1,
 * to improve speed for Big-endian CPU.
 * This option has no impact on Little_Endian CPU.
 */
 #define XXH_FORCE_NATIVE_FORMAT 0
 /* XXH_USELESS_ALIGN_BRANCH :
 * This is a minor performance trick, only useful with lots of very small keys.
 * It means : don't make a test between aligned/unaligned, because performance will be the same.
 * It saves one initial branch per hash.
 */
 #if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
 #  define XXH_USELESS_ALIGN_BRANCH 1
 #endif
 /**************************************
 *  Compiler Specific Options
 ***************************************/
 #ifdef _MSC_VER    /* Visual Studio */
 #  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
 #  define FORCE_INLINE static __forceinline
 #else
 #  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
 #    ifdef __GNUC__
 #      define FORCE_INLINE static inline __attribute__((always_inline))
 #    else
 #      define FORCE_INLINE static inline
 #    endif
 #  else
 #    define FORCE_INLINE static
 #  endif /* __STDC_VERSION__ */
 #endif
 /**************************************
 *  Includes & Memory related functions
 ***************************************/
 #include "xxhash.h"
 /* Modify the local functions below should you wish to use some other memory routines */
 /* for malloc(), free() */
 #include <stdlib.h>
 /* Allocation hook: forwards the request for s bytes to malloc() and returns its result unchanged. */
 static void* XXH_malloc(size_t s)
 {
    void* const block = malloc(s);
    return block;
 }
 /* Release hook: forwards p to free(). */
 static void  XXH_free  (void* p)
 {
    free(p);
 }
 /* for memcpy() */
 #include <string.h>
 /* Copy hook: forwards to memcpy() and returns what memcpy() returns (the destination pointer). */
 static void* XXH_memcpy(void* dest, const void* src, size_t size)
 {
    void* const written = memcpy(dest, src, size);
    return written;
 }
 /**************************************
 *  Basic Types
 ***************************************/
 /* Short integer aliases (BYTE, U16, U32, S32, U64) used by the rest of this file. */
 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
 /* C99 or newer: use the exact-width types from <stdint.h>. */
 # include <stdint.h>
  typedef uint8_t  BYTE;
  typedef uint16_t U16;
  typedef uint32_t U32;
  typedef  int32_t S32;
  typedef uint64_t U64;
 #else
 /* Pre-C99 (and compilers that do not define __STDC_VERSION__): fall back to built-in types. */
 /* NOTE(review): presumably assumes 16-bit short, 32-bit int and 64-bit long long -- confirm for the target ABI. */
  typedef unsigned char      BYTE;
  typedef unsigned short     U16;
  typedef unsigned int       U32;
  typedef   signed int       S32;
  typedef unsigned long long U64;
 #endif
 /* Raw 32/64-bit loads from a possibly unaligned address, in the CPU's native byte order.
 * XXH_FORCE_MEMORY_ACCESS picks one of three implementations of XXH_read32()/XXH_read64():
 *   2 : direct pointer dereference,
 *   1 : access through a packed union (compiler extension),
 *   otherwise : memcpy() into a local variable.
 */
 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
 /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
 static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
 static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
 /* packed-attribute access is safer, but compiler specific, hence potentially problematic for some compilers */
 /* currently only defined for gcc and icc */
 typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
 static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
 static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
 #else
 /* portable and safe solution. Generally efficient.
 * see : http://stackoverflow.com/a/32095106/646947
 */
 static U32 XXH_read32(const void* memPtr)
 {
    U32 val;
    memcpy(&val, memPtr, sizeof(val));
    return val;
 }
 static U64 XXH_read64(const void* memPtr)
 {
    U64 val;
    memcpy(&val, memPtr, sizeof(val));
    return val;
 }
 /* Block comment (not a C++ line comment) so the file still builds as strict C89/C90; names the macro actually tested above. */
 #endif /* XXH_FORCE_MEMORY_ACCESS */
 /******************************************
 *  Compiler-specific Functions and Macros
 ******************************************/
 /* GCC version as major*100+minor, for use in preprocessor checks. */
 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 /* XXH_rotl32 / XXH_rotl64 : rotate x left by r bits within a 32-bit / 64-bit word.
 * x must be an unsigned value of the matching width.
 */
 /* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
 #if defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
 #  define XXH_rotl64(x,r) _rotl64(x,r)
 #else
 /* Every macro argument is parenthesized so that expression arguments (e.g. `a | b`, `n + 1`)
 * are evaluated as a unit; without this, operator precedence silently changes the result.
 */
 #  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
 #  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
 #endif
 /* XXH_swap32 / XXH_swap64 : reverse the byte order of a 32-bit / 64-bit value.
 * Compiler intrinsics are used where available; otherwise the shift-and-mask fallbacks below.
 */
 #if defined(_MSC_VER)     /* Visual Studio */
 #  define XXH_swap32 _byteswap_ulong
 #  define XXH_swap64 _byteswap_uint64
 #elif GCC_VERSION >= 403
 #  define XXH_swap32 __builtin_bswap32
 #  define XXH_swap64 __builtin_bswap64
 #else
 /* Fallback: move each byte of x to its mirrored position and merge the pieces. */
 static U32 XXH_swap32 (U32 x)
 {
    const U32 toByte3 = (x << 24) & 0xff000000;
    const U32 toByte2 = (x <<  8) & 0x00ff0000;
    const U32 toByte1 = (x >>  8) & 0x0000ff00;
    const U32 toByte0 = (x >> 24) & 0x000000ff;
    return toByte3 | toByte2 | toByte1 | toByte0;
 }
 /* Fallback: same scheme as XXH_swap32, over eight bytes. */
 static U64 XXH_swap64 (U64 x)
 {
    U64 swapped;
    swapped  = (x << 56) & 0xff00000000000000ULL;
    swapped |= (x << 40) & 0x00ff000000000000ULL;
    swapped |= (x << 24) & 0x0000ff0000000000ULL;
    swapped |= (x << 8)  & 0x000000ff00000000ULL;
    swapped |= (x >> 8)  & 0x00000000ff000000ULL;
    swapped |= (x >> 24) & 0x0000000000ff0000ULL;
    swapped |= (x >> 40) & 0x000000000000ff00ULL;
    swapped |= (x >> 56) & 0x00000000000000ffULL;
    return swapped;
 }
 #endif
 /***************************************
 *  Architecture Macros
 ***************************************/
 /* Byte-order tag handed to the read helpers; they byte-swap loaded words unless it is XXH_littleEndian. */
 typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
 /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
 #ifndef XXH_CPU_LITTLE_ENDIAN
    /* Runtime probe: reads the first byte of an int holding 1. That byte is 1 when the least */
    /* significant byte is stored first and 0 otherwise, matching the XXH_endianess values. */
    static const int one = 1;
 #   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
 #endif
 /*****************************
 *  Memory reads
 *****************************/
 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
 /* Loads the 32-bit word at ptr and returns it interpreted as little-endian.
 * align  : XXH_unaligned goes through XXH_read32(); any other value dereferences ptr directly.
 * endian : byte order of the running CPU; the word is byte-swapped unless it is XXH_littleEndian.
 */
 FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
 {
    const U32 raw = (align==XXH_unaligned) ? XXH_read32(ptr) : *(const U32*)ptr;
    if (endian!=XXH_littleEndian)
        return XXH_swap32(raw);
    return raw;
 }
 /* Little-endian 32-bit load that makes no alignment assumption about ptr. */
 FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
 {
    const U32 value = XXH_readLE32_align(ptr, endian, XXH_unaligned);
    return value;
 }
 /* Loads the 64-bit word at ptr and returns it interpreted as little-endian.
 * align  : XXH_unaligned goes through XXH_read64(); any other value dereferences ptr directly.
 * endian : byte order of the running CPU; the word is byte-swapped unless it is XXH_littleEndian.
 */
 FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
 {
    const U64 raw = (align==XXH_unaligned) ? XXH_read64(ptr) : *(const U64*)ptr;
    if (endian!=XXH_littleEndian)
        return XXH_swap64(raw);
    return raw;
 }
 /* Little-endian 64-bit load that makes no alignment assumption about ptr. */
 FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
 {
    const U64 value = XXH_readLE64_align(ptr, endian, XXH_unaligned);
    return value;
 }
 /***************************************
 *  Macros
 ***************************************/
 #define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
 /***************************************
 *  Constants
 ***************************************/
 #define PRIME32_1   2654435761U
 #define PRIME32_2   2246822519U
 #define PRIME32_3   3266489917U
 #define PRIME32_4    668265263U
 #define PRIME32_5    374761393U
 #define PRIME64_1 11400714785074694791ULL
 #define PRIME64_2 14029467366897019727ULL
 #define PRIME64_3  1609587929392839161ULL
 #define PRIME64_4  9650029242287828579ULL
 #define PRIME64_5  2870177450012600261ULL
 /*****************************
 *  Simple Hash Functions
 *****************************/
 /* Core one-shot 32-bit hash.
 * input / len : the bytes to hash.
 * seed        : value mixed into the initial accumulator state.
 * endian      : byte order of the running CPU, forwarded to XXH_readLE32_align().
 * align       : whether input may be read with direct aligned loads, forwarded likewise.
 * Returns the 32-bit hash of the input.
 */
 FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
 {
    const BYTE* p = (const BYTE*)input;
    const BYTE* bEnd = p + len;
    U32 h32;
 /* Local shorthand for a 32-bit little-endian read with this call's endian/align settings. */
 /* Note: the macro is not #undef'd in this function. */
 #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
    /* Optional mode: treat a NULL input as a zero-length input by pointing p and bEnd at the */
    /* same dummy address, so none of the loops below read anything. */
    if (p==NULL)
    {
        len=0;
        bEnd=p=(const BYTE*)(size_t)16;
    }
 #endif
    if (len>=16)
    {
        /* Bulk phase: four accumulators each consume one 32-bit word per iteration */
        /* (16 bytes per pass) for as long as a full 16-byte stripe remains. */
        const BYTE* const limit = bEnd - 16;
        U32 v1 = seed + PRIME32_1 + PRIME32_2;
        U32 v2 = seed + PRIME32_2;
        U32 v3 = seed + 0;
        U32 v4 = seed - PRIME32_1;
        do
        {
            v1 += XXH_get32bits(p) * PRIME32_2;
            v1 = XXH_rotl32(v1, 13);
            v1 *= PRIME32_1;
            p+=4;
            v2 += XXH_get32bits(p) * PRIME32_2;
            v2 = XXH_rotl32(v2, 13);
            v2 *= PRIME32_1;
            p+=4;
            v3 += XXH_get32bits(p) * PRIME32_2;
            v3 = XXH_rotl32(v3, 13);
            v3 *= PRIME32_1;
            p+=4;
            v4 += XXH_get32bits(p) * PRIME32_2;
            v4 = XXH_rotl32(v4, 13);
            v4 *= PRIME32_1;
            p+=4;
        }
        while (p<=limit);
        /* Fold the four accumulators into a single 32-bit value. */
        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
    }
    else
    {
        /* Fewer than 16 bytes: no bulk phase, start directly from the seed. */
        h32  = seed + PRIME32_5;
    }
    /* The total input length is part of the hash. */
    h32 += (U32) len;
    /* Tail, part 1: remaining whole 32-bit words. */
    while (p+4<=bEnd)
    {
        h32 += XXH_get32bits(p) * PRIME32_3;
        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
        p+=4;
    }
    /* Tail, part 2: remaining single bytes. */
    while (p<bEnd)
    {
        h32 += (*p) * PRIME32_5;
        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
        p++;
    }
    /* Final mixing: alternating xor-shift and multiply steps over the accumulated value. */
    h32 ^= h32 >> 15;
    h32 *= PRIME32_2;
    h32 ^= h32 >> 13;
    h32 *= PRIME32_3;
    h32 ^= h32 >> 16;
    return h32;
 }
 /* Public one-shot 32-bit hash of len bytes at input, using seed.
 * A simpler XXH32_reset() / XXH32_update() / XXH32_digest() formulation on a local state
 * is possible, but is slow for small inputs, so the core routine is called directly with
 * compile-time-constant endian/alignment arguments.
 */
 unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
 {
    const XXH_endianess cpuEndian = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
    /* No byte swapping is needed on little-endian CPUs, or when native format is forced. */
    const int readAsLittle = (cpuEndian==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT;
 #  if !defined(XXH_USELESS_ALIGN_BRANCH)
    /* Input is 4-bytes aligned, leverage the speed benefit */
    if ((((size_t)input) & 3) == 0)
    {
        return readAsLittle
             ? XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned)
             : XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
    }
 #  endif
    return readAsLittle
         ? XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned)
         : XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
 }
 /* XXH64_endian_align() :
  * Core one-shot XXH64 routine. Hashes the `len` bytes at `input` with `seed`
  * and returns the 64-bit result.
  * `endian` and `align` are not used directly : they are picked up by name by
  * the XXH_get64bits() macro defined below (and by XXH_get32bits(), which is
  * defined outside this function -- presumably the 32-bit equivalent; confirm),
  * so these two parameter names must not be changed.
  */
 FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
 {
    const BYTE* p = (const BYTE*)input;
    const BYTE* bEnd = p + len;
    U64 h64;
 /* 64-bit read at p, forwarding this function's `endian` / `align` arguments */
 #define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
    /* A NULL input is hashed as an empty input : p and bEnd are set to the same
       dummy non-NULL address so that none of the loops below reads anything. */
    if (p==NULL)
    {
        len=0;
        bEnd=p=(const BYTE*)(size_t)32;
    }
 #endif
    if (len>=32)
    {
        /* Bulk phase : consume 32 bytes per iteration, 8 bytes into each of the
           four accumulators. `limit` is the last position where 32 bytes remain. */
        const BYTE* const limit = bEnd - 32;
        U64 v1 = seed + PRIME64_1 + PRIME64_2;
        U64 v2 = seed + PRIME64_2;
        U64 v3 = seed + 0;
        U64 v4 = seed - PRIME64_1;
        do
        {
            v1 += XXH_get64bits(p) * PRIME64_2;
            p+=8;
            v1 = XXH_rotl64(v1, 31);
            v1 *= PRIME64_1;
            v2 += XXH_get64bits(p) * PRIME64_2;
            p+=8;
            v2 = XXH_rotl64(v2, 31);
            v2 *= PRIME64_1;
            v3 += XXH_get64bits(p) * PRIME64_2;
            p+=8;
            v3 = XXH_rotl64(v3, 31);
            v3 *= PRIME64_1;
            v4 += XXH_get64bits(p) * PRIME64_2;
            p+=8;
            v4 = XXH_rotl64(v4, 31);
            v4 *= PRIME64_1;
        }
        while (p<=limit);
        /* Combine the four accumulators into h64 ... */
        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
        /* ... then fold each accumulator in once more, one after the other (v1, v2, v3, v4) */
        v1 *= PRIME64_2;
        v1 = XXH_rotl64(v1, 31);
        v1 *= PRIME64_1;
        h64 ^= v1;
        h64 = h64 * PRIME64_1 + PRIME64_4;
        v2 *= PRIME64_2;
        v2 = XXH_rotl64(v2, 31);
        v2 *= PRIME64_1;
        h64 ^= v2;
        h64 = h64 * PRIME64_1 + PRIME64_4;
        v3 *= PRIME64_2;
        v3 = XXH_rotl64(v3, 31);
        v3 *= PRIME64_1;
        h64 ^= v3;
        h64 = h64 * PRIME64_1 + PRIME64_4;
        v4 *= PRIME64_2;
        v4 = XXH_rotl64(v4, 31);
        v4 *= PRIME64_1;
        h64 ^= v4;
        h64 = h64 * PRIME64_1 + PRIME64_4;
    }
    else
    {
        /* Fewer than 32 bytes : no bulk phase, start directly from the seed */
        h64  = seed + PRIME64_5;
    }
    h64 += (U64) len;
    /* Tail phase : remaining bytes are consumed 8 at a time, then at most one
       4-byte group, then byte by byte. */
    while (p+8<=bEnd)
    {
        U64 k1 = XXH_get64bits(p);
        k1 *= PRIME64_2;
        k1 = XXH_rotl64(k1,31);
        k1 *= PRIME64_1;
        h64 ^= k1;
        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
        p+=8;
    }
    if (p+4<=bEnd)
    {
        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
        p+=4;
    }
    while (p<bEnd)
    {
        h64 ^= (*p) * PRIME64_5;
        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
        p++;
    }
    /* Final mixing : alternating xor-shift and multiply steps */
    h64 ^= h64 >> 33;
    h64 *= PRIME64_2;
    h64 ^= h64 >> 29;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
 }
 /* XXH64() :
  * One-shot 64-bit hash of the `len` bytes starting at `input`, using `seed`.
  * All hashing is delegated to XXH64_endian_align(); this wrapper only picks
  * which (endianness, alignment) selector pair to call it with.
  */
 unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
 {
 #if 0
    /* Reference variant built on the streaming API : simpler to maintain, but slow for small inputs */
    XXH64_state_t state;
    XXH64_reset(&state, seed);
    XXH64_update(&state, input, len);
    return XXH64_digest(&state);
 #else
    const XXH_endianess cpuEndian = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
    /* non-zero => use the little-endian reader (detected little-endian, or native format forced) */
    const int readAsLE = (cpuEndian==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT;
 #  if !defined(XXH_USELESS_ALIGN_BRANCH)
    if ((((size_t)input) & 7)==0)   /* address is a multiple of 8 : use the aligned variant */
    {
        if (readAsLE)
            return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
    }
 #  endif
    if (readAsLE)
        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
    return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
 #endif
 }
 /****************************************************
 *  Advanced Hash Functions
 ****************************************************/
 /*** Allocation ***/
 /* Internal layout of an XXH32 streaming state.
  * The XXH32_* streaming functions cast the public XXH32_state_t* to this type,
  * and XXH32_createState() statically asserts that XXH32_state_t is at least
  * as large as this struct.
  */
 typedef struct
 {
    U64 total_len;  /* number of bytes passed to XXH32_update() since the last XXH32_reset() */
    U32 seed;       /* seed given to XXH32_reset() */
    U32 v1;         /* v1..v4 : the four running accumulators */
    U32 v2;
    U32 v3;
    U32 v4;
    U32 mem32[4];   /* 16-byte buffer for input not yet consumed; defined as U32 for alignment */
    U32 memsize;    /* number of bytes currently held in mem32 */
 } XXH_istate32_t;
 /* Internal layout of an XXH64 streaming state.
  * The XXH64_* streaming functions cast the public XXH64_state_t* to this type,
  * and XXH64_createState() statically asserts that XXH64_state_t is at least
  * as large as this struct.
  */
 typedef struct
 {
    U64 total_len;  /* number of bytes passed to XXH64_update() since the last XXH64_reset() */
    U64 seed;       /* seed given to XXH64_reset() */
    U64 v1;         /* v1..v4 : the four running accumulators */
    U64 v2;
    U64 v3;
    U64 v4;
    U64 mem64[4];   /* 32-byte buffer for input not yet consumed; defined as U64 for alignment */
    U32 memsize;    /* number of bytes currently held in mem64 */
 } XXH_istate64_t;
 /* Allocates storage for one XXH32 streaming state with XXH_malloc().
  * The pointer is returned exactly as XXH_malloc() produced it (no check, no
  * initialization) : the state must go through XXH32_reset() before use.
  */
 XXH32_state_t* XXH32_createState(void)
 {
    /* Compile-time guard : fails to build if the public type cannot hold the internal state */
    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));
    {
        void* const storage = XXH_malloc(sizeof(XXH32_state_t));
        return (XXH32_state_t*)storage;
    }
 }
 /* Releases a state obtained from XXH32_createState() through XXH_free().
  * Always reports XXH_OK.
  */
 XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
 {
    const XXH_errorcode status = XXH_OK;
    XXH_free(statePtr);
    return status;
 }
 /* Allocates storage for one XXH64 streaming state with XXH_malloc().
  * The pointer is returned exactly as XXH_malloc() produced it (no check, no
  * initialization) : the state must go through XXH64_reset() before use.
  */
 XXH64_state_t* XXH64_createState(void)
 {
    /* Compile-time guard : fails to build if the public type cannot hold the internal state */
    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));
    {
        void* const storage = XXH_malloc(sizeof(XXH64_state_t));
        return (XXH64_state_t*)storage;
    }
 }
 /* Releases a state obtained from XXH64_createState() through XXH_free().
  * Always reports XXH_OK.
  */
 XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
 {
    const XXH_errorcode status = XXH_OK;
    XXH_free(statePtr);
    return status;
 }
 /*** Hash feed ***/
 /* Starts a new XXH32 streaming hash in `state_in` with the given `seed`.
  * Clears the byte counter and the pending-input count, and loads the four
  * accumulators with their seed-derived start values. The mem32 buffer itself
  * is not cleared. Always returns XXH_OK.
  */
 XXH_errorcode XXH32_reset(XXH32_state_t* state_in, unsigned int seed)
 {
    XXH_istate32_t* const st = (XXH_istate32_t*) state_in;

    /* bookkeeping : nothing consumed, nothing buffered */
    st->total_len = 0;
    st->memsize   = 0;
    st->seed      = seed;

    /* accumulators */
    st->v1 = seed + PRIME32_1 + PRIME32_2;
    st->v2 = seed + PRIME32_2;
    st->v3 = seed + 0;
    st->v4 = seed - PRIME32_1;

    return XXH_OK;
 }
 /* Starts a new XXH64 streaming hash in `state_in` with the given `seed`.
  * Clears the byte counter and the pending-input count, and loads the four
  * accumulators with their seed-derived start values. The mem64 buffer itself
  * is not cleared. Always returns XXH_OK.
  */
 XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
 {
    XXH_istate64_t* const st = (XXH_istate64_t*) state_in;

    /* bookkeeping : nothing consumed, nothing buffered */
    st->total_len = 0;
    st->memsize   = 0;
    st->seed      = seed;

    /* accumulators */
    st->v1 = seed + PRIME64_1 + PRIME64_2;
    st->v2 = seed + PRIME64_2;
    st->v3 = seed + 0;
    st->v4 = seed - PRIME64_1;

    return XXH_OK;
 }
 /* XXH32_update_endian() :
  * Feeds `len` bytes from `input` into the XXH32 streaming state `state_in`.
  * Input is consumed in 16-byte groups (4 bytes per accumulator); whatever does
  * not fill a whole group is kept in state->mem32 for the next call or for
  * the digest. `endian` is forwarded to XXH_readLE32().
  * Returns XXH_OK, or XXH_ERROR when XXH_ACCEPT_NULL_INPUT_POINTER is defined
  * and `input` is NULL.
  */
 FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
 {
    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
    const BYTE* p = (const BYTE*)input;
    const BYTE* bEnd;
 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
    /* reject NULL before any pointer arithmetic is performed on it */
    if (input==NULL) return XXH_ERROR;
 #endif
    bEnd = p + len;
    state->total_len += len;
    if (state->memsize + len < 16)   /* not enough for a full group : just buffer the input */
    {
        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
        state->memsize += (U32)len;
        return XXH_OK;
    }
    if (state->memsize)   /* some data left from previous update */
    {
        /* top the buffer up to 16 bytes with the start of the new input, then consume it */
        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
        {
            const U32* p32 = state->mem32;
            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
            state->v1 = XXH_rotl32(state->v1, 13);
            state->v1 *= PRIME32_1;
            p32++;
            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
            state->v2 = XXH_rotl32(state->v2, 13);
            state->v2 *= PRIME32_1;
            p32++;
            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
            state->v3 = XXH_rotl32(state->v3, 13);
            state->v3 *= PRIME32_1;
            p32++;
            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
            state->v4 = XXH_rotl32(state->v4, 13);
            state->v4 *= PRIME32_1;
            p32++;
        }
        p += 16-state->memsize;
        state->memsize = 0;
    }
    /* Compare the remaining byte count instead of forming bEnd-16 : that pointer
       would lie before the caller's buffer when fewer than 16 bytes remain. */
    if ((size_t)(bEnd - p) >= 16)
    {
        const BYTE* const limit = bEnd - 16;
        U32 v1 = state->v1;
        U32 v2 = state->v2;
        U32 v3 = state->v3;
        U32 v4 = state->v4;
        do
        {
            v1 += XXH_readLE32(p, endian) * PRIME32_2;
            v1 = XXH_rotl32(v1, 13);
            v1 *= PRIME32_1;
            p+=4;
            v2 += XXH_readLE32(p, endian) * PRIME32_2;
            v2 = XXH_rotl32(v2, 13);
            v2 *= PRIME32_1;
            p+=4;
            v3 += XXH_readLE32(p, endian) * PRIME32_2;
            v3 = XXH_rotl32(v3, 13);
            v3 *= PRIME32_1;
            p+=4;
            v4 += XXH_readLE32(p, endian) * PRIME32_2;
            v4 = XXH_rotl32(v4, 13);
            v4 *= PRIME32_1;
            p+=4;
        }
        while (p<=limit);
        state->v1 = v1;
        state->v2 = v2;
        state->v3 = v3;
        state->v4 = v4;
    }
    if (p < bEnd)   /* fewer than 16 bytes left : keep them for later */
    {
        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
        state->memsize = (U32)(bEnd-p);
    }
    return XXH_OK;
 }
 /* Public streaming entry point : forwards to XXH32_update_endian() with the
  * endianness selector chosen from XXH_CPU_LITTLE_ENDIAN / XXH_FORCE_NATIVE_FORMAT,
  * and returns its result.
  */
 XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
 {
    const XXH_endianess cpuEndian = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
    /* big-endian reader only when not detected little-endian and native format is not forced */
    if ((cpuEndian!=XXH_littleEndian) && !(XXH_FORCE_NATIVE_FORMAT))
        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
    return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
 }
 /* XXH32_digest_endian() :
  * Computes the 32-bit hash of everything fed into `state_in` so far.
  * The state is only read (it is const), so more input can be fed afterwards.
  * `endian` is forwarded to XXH_readLE32() for the buffered 4-byte reads.
  */
 FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
 {
    const XXH_istate32_t* const st = (const XXH_istate32_t*) state_in;
    const BYTE* cur = (const BYTE*)st->mem32;
    const BYTE* const stop = cur + st->memsize;
    U32 acc;

    /* Start value : seed-based when fewer than 16 bytes were fed in total,
       otherwise the combination of the four accumulators */
    if (st->total_len < 16)
        acc = st->seed + PRIME32_5;
    else
        acc = XXH_rotl32(st->v1, 1) + XXH_rotl32(st->v2, 7) + XXH_rotl32(st->v3, 12) + XXH_rotl32(st->v4, 18);
    acc += (U32) st->total_len;

    /* Buffered bytes : 4 at a time first ... */
    for ( ; cur+4<=stop; cur+=4)
    {
        acc += XXH_readLE32(cur, endian) * PRIME32_3;
        acc  = XXH_rotl32(acc, 17) * PRIME32_4;
    }
    /* ... then one by one */
    for ( ; cur<stop; cur++)
    {
        acc += (*cur) * PRIME32_5;
        acc  = XXH_rotl32(acc, 11) * PRIME32_1;
    }

    /* Final mixing : alternating xor-shift and multiply steps */
    acc ^= acc >> 15;
    acc *= PRIME32_2;
    acc ^= acc >> 13;
    acc *= PRIME32_3;
    acc ^= acc >> 16;
    return acc;
 }
 /* Public digest entry point : forwards to XXH32_digest_endian() with the
  * endianness selector chosen from XXH_CPU_LITTLE_ENDIAN / XXH_FORCE_NATIVE_FORMAT,
  * and returns the resulting 32-bit hash.
  */
 unsigned int XXH32_digest (const XXH32_state_t* state_in)
 {
    const XXH_endianess cpuEndian = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
    /* big-endian reader only when not detected little-endian and native format is not forced */
    if ((cpuEndian!=XXH_littleEndian) && !(XXH_FORCE_NATIVE_FORMAT))
        return XXH32_digest_endian(state_in, XXH_bigEndian);
    return XXH32_digest_endian(state_in, XXH_littleEndian);
 }
 /* XXH64_update_endian() :
  * Feeds `len` bytes from `input` into the XXH64 streaming state `state_in`.
  * Input is consumed in 32-byte groups (8 bytes per accumulator); whatever does
  * not fill a whole group is kept in state->mem64 for the next call or for
  * the digest. `endian` is forwarded to XXH_readLE64().
  * Returns XXH_OK, or XXH_ERROR when XXH_ACCEPT_NULL_INPUT_POINTER is defined
  * and `input` is NULL.
  */
 FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
 {
    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
    const BYTE* p = (const BYTE*)input;
    const BYTE* bEnd;
 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
    /* reject NULL before any pointer arithmetic is performed on it */
    if (input==NULL) return XXH_ERROR;
 #endif
    bEnd = p + len;
    state->total_len += len;
    if (state->memsize + len < 32)   /* not enough for a full group : just buffer the input */
    {
        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
        state->memsize += (U32)len;
        return XXH_OK;
    }
    if (state->memsize)   /* some data left from previous update */
    {
        /* top the buffer up to 32 bytes with the start of the new input, then consume it */
        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
        {
            const U64* p64 = state->mem64;
            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
            state->v1 = XXH_rotl64(state->v1, 31);
            state->v1 *= PRIME64_1;
            p64++;
            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
            state->v2 = XXH_rotl64(state->v2, 31);
            state->v2 *= PRIME64_1;
            p64++;
            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
            state->v3 = XXH_rotl64(state->v3, 31);
            state->v3 *= PRIME64_1;
            p64++;
            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
            state->v4 = XXH_rotl64(state->v4, 31);
            state->v4 *= PRIME64_1;
            p64++;
        }
        p += 32-state->memsize;
        state->memsize = 0;
    }
    /* Compare the remaining byte count instead of forming p+32 : that pointer
       would lie beyond the caller's buffer when fewer than 32 bytes remain. */
    if ((size_t)(bEnd - p) >= 32)
    {
        const BYTE* const limit = bEnd - 32;
        U64 v1 = state->v1;
        U64 v2 = state->v2;
        U64 v3 = state->v3;
        U64 v4 = state->v4;
        do
        {
            v1 += XXH_readLE64(p, endian) * PRIME64_2;
            v1 = XXH_rotl64(v1, 31);
            v1 *= PRIME64_1;
            p+=8;
            v2 += XXH_readLE64(p, endian) * PRIME64_2;
            v2 = XXH_rotl64(v2, 31);
            v2 *= PRIME64_1;
            p+=8;
            v3 += XXH_readLE64(p, endian) * PRIME64_2;
            v3 = XXH_rotl64(v3, 31);
            v3 *= PRIME64_1;
            p+=8;
            v4 += XXH_readLE64(p, endian) * PRIME64_2;
            v4 = XXH_rotl64(v4, 31);
            v4 *= PRIME64_1;
            p+=8;
        }
        while (p<=limit);
        state->v1 = v1;
        state->v2 = v2;
        state->v3 = v3;
        state->v4 = v4;
    }
    if (p < bEnd)   /* fewer than 32 bytes left : keep them for later */
    {
        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
        state->memsize = (U32)(bEnd-p);
    }
    return XXH_OK;
 }
 /* Public streaming entry point : forwards to XXH64_update_endian() with the
  * endianness selector chosen from XXH_CPU_LITTLE_ENDIAN / XXH_FORCE_NATIVE_FORMAT,
  * and returns its result.
  */
 XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
 {
    const XXH_endianess cpuEndian = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
    /* big-endian reader only when not detected little-endian and native format is not forced */
    if ((cpuEndian!=XXH_littleEndian) && !(XXH_FORCE_NATIVE_FORMAT))
        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
    return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
 }
 /* XXH64_digest_endian() :
  * Computes the 64-bit hash of everything fed into `state_in` so far.
  * The state is only read (it is const; the accumulators are copied into
  * locals before being modified), so more input can be fed afterwards.
  * `endian` is forwarded to XXH_readLE64() / XXH_readLE32() for buffered reads.
  */
 FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
 {
    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
    /* p .. bEnd delimits the bytes still pending in the state's buffer */
    const BYTE * p = (const BYTE*)state->mem64;
    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
    U64 h64;
    if (state->total_len >= 32)
    {
        /* At least 32 bytes were fed in total : combine the four accumulators ... */
        U64 v1 = state->v1;
        U64 v2 = state->v2;
        U64 v3 = state->v3;
        U64 v4 = state->v4;
        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
        /* ... then fold each accumulator in once more, one after the other (v1, v2, v3, v4) */
        v1 *= PRIME64_2;
        v1 = XXH_rotl64(v1, 31);
        v1 *= PRIME64_1;
        h64 ^= v1;
        h64 = h64*PRIME64_1 + PRIME64_4;
        v2 *= PRIME64_2;
        v2 = XXH_rotl64(v2, 31);
        v2 *= PRIME64_1;
        h64 ^= v2;
        h64 = h64*PRIME64_1 + PRIME64_4;
        v3 *= PRIME64_2;
        v3 = XXH_rotl64(v3, 31);
        v3 *= PRIME64_1;
        h64 ^= v3;
        h64 = h64*PRIME64_1 + PRIME64_4;
        v4 *= PRIME64_2;
        v4 = XXH_rotl64(v4, 31);
        v4 *= PRIME64_1;
        h64 ^= v4;
        h64 = h64*PRIME64_1 + PRIME64_4;
    }
    else
    {
        /* Fewer than 32 bytes in total : start directly from the seed */
        h64  = state->seed + PRIME64_5;
    }
    h64 += (U64) state->total_len;
    /* Pending bytes are consumed 8 at a time, then at most one 4-byte group,
       then byte by byte. */
    while (p+8<=bEnd)
    {
        U64 k1 = XXH_readLE64(p, endian);
        k1 *= PRIME64_2;
        k1 = XXH_rotl64(k1,31);
        k1 *= PRIME64_1;
        h64 ^= k1;
        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
        p+=8;
    }
    if (p+4<=bEnd)
    {
        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
        p+=4;
    }
    while (p<bEnd)
    {
        h64 ^= (*p) * PRIME64_5;
        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
        p++;
    }
    /* Final mixing : alternating xor-shift and multiply steps */
    h64 ^= h64 >> 33;
    h64 *= PRIME64_2;
    h64 ^= h64 >> 29;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
 }
 /* Public digest entry point : forwards to XXH64_digest_endian() with the
  * endianness selector chosen from XXH_CPU_LITTLE_ENDIAN / XXH_FORCE_NATIVE_FORMAT,
  * and returns the resulting 64-bit hash.
  */
 unsigned long long XXH64_digest (const XXH64_state_t* state_in)
 {
    const XXH_endianess cpuEndian = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
    /* big-endian reader only when not detected little-endian and native format is not forced */
    if ((cpuEndian!=XXH_littleEndian) && !(XXH_FORCE_NATIVE_FORMAT))
        return XXH64_digest_endian(state_in, XXH_bigEndian);
    return XXH64_digest_endian(state_in, XXH_littleEndian);
 }
--- a/xxhash/xxhash.h
+++ b/xxhash/xxhash.h
@@ -0,0 +1,192 @@
 /*
   xxHash - Extremely Fast Hash algorithm
   Header File
   Copyright (C) 2012-2015, Yann Collet.
   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   You can contact the author at :
   - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 /* Notice extracted from xxHash homepage :
 xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
 It also successfully passes all tests from the SMHasher suite.
 Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
 Name            Speed       Q.Score   Author
 xxHash          5.4 GB/s     10
 CrapWow         3.2 GB/s      2       Andrew
 MurmurHash 3a   2.7 GB/s     10       Austin Appleby
 SpookyHash      2.0 GB/s     10       Bob Jenkins
 SBox            1.4 GB/s      9       Bret Mulvey
 Lookup3         1.2 GB/s      9       Bob Jenkins
 SuperFastHash   1.2 GB/s      1       Paul Hsieh
 CityHash64      1.05 GB/s    10       Pike & Alakuijala
 FNV             0.55 GB/s     5       Fowler, Noll, Vo
 CRC32           0.43 GB/s     9
 MD5-32          0.33 GB/s    10       Ronald L. Rivest
 SHA1-32         0.28 GB/s    10
 Q.Score is a measure of quality of the hash function.
 It depends on successfully passing SMHasher test set.
 10 is a perfect score.
 A 64-bits version, named XXH64, is available since r35.
 It offers much better speed, but for 64-bits applications only.
 Name     Speed on 64 bits    Speed on 32 bits
 XXH64       13.8 GB/s            1.9 GB/s
 XXH32        6.8 GB/s            6.0 GB/s
 */
 #pragma once
 #if defined (__cplusplus)
 extern "C" {
 #endif
 /*****************************
 *  Definitions
 *****************************/
 #include <stddef.h>   /* size_t */
 typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 /*****************************
 *  Namespace Emulation
 *****************************/
 /* Motivations :
 If you need to include xxHash in your library,
 but do not want xxHash symbols to appear in your library's interface
 (to avoid a potential name collision if another library also includes xxHash),
 you can define XXH_NAMESPACE, which will automatically prefix every xxHash symbol
 with the value of XXH_NAMESPACE (so do not leave it empty, and do not use a numeric value).
 Note that no change is required within the calling program :
 it can still call xxHash functions using their regular name.
 They will be automatically translated by this header.
 */
 #ifdef XXH_NAMESPACE
 #  define XXH_CAT(A,B) A##B
 #  define XXH_NAME2(A,B) XXH_CAT(A,B)
 #  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
 #  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
 #  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
 #  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
 #  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
 #  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
 #  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
 #  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
 #  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
 #  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
 #  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
 #  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
 #endif
 /*****************************
 *  Simple Hash Functions
 *****************************/
 unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
 unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
 /*
 XXH32() :
    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
    The memory between input & input+length must be valid (allocated and read-accessible).
    "seed" can be used to alter the result predictably.
    This function successfully passes all SMHasher tests.
    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
 XXH64() :
    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
    Faster on 64-bits systems. Slower on 32-bits systems.
 */
 /*****************************
 *  Advanced Hash Functions
 *****************************/
 /* Opaque state storage : arrays of 6 and 11 long long respectively.
    xxhash.c checks at compile time (in XXHnn_createState) that each one is
    at least as large as the corresponding internal state structure. */
 typedef struct { long long ll[ 6]; } XXH32_state_t;
 typedef struct { long long ll[11]; } XXH64_state_t;
 /*
 These structures allow static allocation of XXH states.
 States must then be initialized using XXHnn_reset() before first use.
 If you prefer dynamic allocation, please refer to functions below.
 */
 XXH32_state_t* XXH32_createState(void);
 XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
 XXH64_state_t* XXH64_createState(void);
 XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
 /*
 These functions create and release memory for XXH state.
 States must then be initialized using XXHnn_reset() before first use.
 */
 XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
 XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
 unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
 XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
 XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
 unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
 /*
 These functions calculate the xxHash of an input provided in multiple smaller packets,
 as opposed to an input provided as a single block.
 XXH state space must first be allocated, using either static or dynamic method provided above.
 Start a new hash by initializing state with a seed, using XXHnn_reset().
 Then, feed the hash state by calling XXHnn_update() as many times as necessary.
 Obviously, input must be valid, meaning allocated and read accessible.
 The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
 Finally, you can produce a hash anytime, by using XXHnn_digest().
 This function returns the final nn-bits hash.
 You can nonetheless continue feeding the hash state with more input,
 and then obtain updated hashes by calling XXHnn_digest() again.
 When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
 */
 #if defined (__cplusplus)
 }
 #endif
`@@ -1,2 +1,2 @@`
	`add_executable(booTest WIN32 main.cpp)`	`add_executable(booTest WIN32 main.cpp)`
	`target_link_libraries(booTest boo logvisor ${BOO_SYS_LIBS})`	`target_link_libraries(booTest boo logvisor xxhash ${BOO_SYS_LIBS})`