Оптимизация переносимых 128-битных целочисленных сдвигов на 32-битной архитектуре

В свободное время я работал над утилитарной библиотекой, которая, помимо прочего, поддерживает 128-битные целые числа со знаком / без знака. Эта библиотека в некоторых случаях использует диспетчеризацию cpu для использования инструкций simd, но требует переносного запасного варианта, поэтому она будет работать везде. Совсем недавно я реализовал переносной запасной вариант для 128-битного сдвига. Он работает правильно и работает достаточно быстро, но не так быстро, как хотелось бы, особенно на 32-битных архитектурах.

Вот урезанная версия со всеми соответствующими типами и функциональностью (64-разрядная версия включена для полноты):

typedef uint32_t UInt32;
typedef int32_t Int32;
typedef uint64_t UInt64;
typedef int64_t Int64;

// Returns 0xFFFFFFFF if value != 0, otherwise returns 0.
UInt32 AllOrNothingMask32(Int32 value)
{
    return UInt32(-Int32(value != 0));
}

struct alignas(16) UInt128
{     
    // Ensure the layout matches the architecture.
    // LE = little endian
    // BE = big endian
#if CPU_TYPE == CPU_LE32
    UInt32 mLow;
    UInt32 mLowMid;
    UInt32 mHighMid;
    UInt32 mHigh;
#elif CPU_TYPE == CPU_BE32
    UInt32 mHigh;
    UInt32 mHighMid;
    UInt32 mLowMid;
    UInt32 mLow;
#elif CPU_TYPE == CPU_LE64
    UInt64 mLow;
    UInt64 mHigh;
#elif CPU_TYPE == CPU_BE64
    UInt64 mHigh;
    UInt64 mLow;
#endif

    UInt128() = default;

    UInt128& operator=(const UInt128& other) = default;

    inline
    UInt128(UInt32 high, UInt32 highMid, UInt32 lowMid, UInt32 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(low),
        mLowMid(lowMid),
        mHighMid(highMid),
        mHigh(high) { }
#elif CPU_SIZE == CPU_64BIT
        mLow((UInt64(lowMid) << 32) | low),
        mHigh((UInt64(high) << 32) | highMid) { }
#endif

    inline
    UInt128(UInt64 high, UInt64 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(UInt32(low)),
        mLowMid(UInt32(low >> 32)),
        mHighMid(UInt32(high)),
        mHigh(UInt32(high >> 32)) { }
#elif CPU_SIZE == CPU_64BIT
        mLow(low),
        mHigh(high) { }
#endif

    inline
    bool UInt128::operator==(const UInt128& other) const noexcept
    {
#if CPU_TYPE == CPU_32BIT
        return mLow == other.mLow &&
               mLowMid == other.mLowMid &&
               mHighMid == other.mHighMid &&
               mHigh == other.mHigh;
#elif CPU_TYPE == CPU_64BIT
        return mLow == other.mLow &&
               mHigh == other.mHigh;
#endif
    }

    inline
    UInt128& UInt128::operator<<=(Int32 shift) noexcept
    {
        // Shift is modulo 128, effectively clamping it between 0-127.
        shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
        auto low = mLow;
        auto lowMid = mLowMid;
        auto highMid = mHighMid;
        auto high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 32) {
            auto rshift = 32 - shift;
            mLow = (low << shift);
            mLowMid = (lowMid << shift) | (low >> rshift);
            mHighMid = (highMid << shift) | (lowMid >> rshift);
            mHigh = (high << shift) | (highMid >> rshift);
        } else if (shift < 64) {
            auto lshift = (shift - 32);
            auto rshift = (32 - lshift) & 0x1F;
            auto rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = (low << lshift);
            mHighMid = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
            mHigh = (highMid << lshift) | ((lowMid >> rshift) & rshiftMask);
        } else if (shift < 96) {
            auto lshift = (shift - 64);
            auto rshift = (64 - lshift) & 0x1F;
            auto rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = 0;
            mHighMid = (low << lshift);
            mHigh = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
        } else {
            mLow = 0;
            mLowMid = 0;
            mHighMid = 0;
            mHigh = (low << (shift - 96));
        }
#elif CPU_SIZE == CPU_64BIT
        auto low = mLow,
             high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 64) {
            mLow = (low << shift);
            mHigh = (high << shift) | (low >> (64 - shift));
        } else {
            mLow = 0;
            mHigh = (low << (shift - 64));
        }
#endif
        return *this;
    }

    inline
    UInt128& UInt128::operator>>=(Int32 shift) noexcept
    {
        // Shift is modulo 128, effectively clamping it between 0-127.
        shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
        auto low = mLow,
             lowMid = mLowMid,
             highMid = mHighMid,
             high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 32) {
            auto rshift = 32 - shift;
            mLow = (low >> shift) | (lowMid << rshift);
            mLowMid = (lowMid >> shift) | (highMid << rshift);
            mHighMid = (highMid >> shift) | (high << rshift);
            mHigh = (high >> shift);
        } else if (shift < 64) {
            auto rshift = (shift - 32);
            auto lshift = (32 - rshift) & 0x1F;
            auto lshiftMask = AllOrNothingMask32(lshift);
            mLow = (lowMid >> rshift) | ((highMid << lshift) & lshiftMask);
            mLowMid = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mHighMid = (high >> rshift);
            mHigh = 0;
        } else if (shift < 96) {
            auto rshift = (shift - 64);
            auto lshift = (64 - rshift) & 0x1F;
            auto lshiftMask = AllOrNothingMask32(lshift);
            mLow = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mLowMid = (high >> rshift);
            mHighMid = 0;
            mHigh = 0;
        } else {
            mLow = (high >> (shift - 96));
            mLowMid = 0;
            mHighMid = 0;
            mHigh = 0;
        }
#elif CPU_SIZE == CPU_64BIT
        auto low = mLow,
             high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 64) {
            mLow = (low >> shift) | (high << (64 - shift));
            mHigh = (high >> shift);
        } else {
            mLow = (high >> (shift - 64));
            mHigh = 0;
        }
#endif
        return *this;
    }
};

Соответствующий вывод 32-битной сборки довольно длинный, поэтому, если не будет запрошено, я его опущу.

Основным узким местом является, очевидно, ветвление, когда аргумент shift не известен во время компиляции. Что можно сделать, чтобы устранить разветвление, или, если уж на то пошло, какой переносной прием можно использовать для ускорения этого процесса?

Обновление 1

Добавлен оператор назначения копирования, который отсутствовал в приведенном выше примере. Для тех, кто заинтересован, вот модульные тесты. Я использую Catch для простоты.

// Left shift lookup table, from 0-127.
const UInt128 gLeftShiftLut128[] = {
    UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
    UInt128(0xFFDDBB9977553310, 0xEECCAA8866442200),
    UInt128(0xFFBB7732EEAA6621, 0xDD995510CC884400),
    UInt128(0xFF76EE65DD54CC43, 0xBB32AA2199108800),
    UInt128(0xFEEDDCCBBAA99887, 0x7665544332211000),
    UInt128(0xFDDBB9977553310E, 0xECCAA88664422000),
    UInt128(0xFBB7732EEAA6621D, 0xD995510CC8844000),
    UInt128(0xF76EE65DD54CC43B, 0xB32AA21991088000),
    UInt128(0xEEDDCCBBAA998877, 0x6655443322110000),
    UInt128(0xDDBB9977553310EE, 0xCCAA886644220000),
    UInt128(0xBB7732EEAA6621DD, 0x995510CC88440000),
    UInt128(0x76EE65DD54CC43BB, 0x32AA219910880000),
    UInt128(0xEDDCCBBAA9988776, 0x6554433221100000),
    UInt128(0xDBB9977553310EEC, 0xCAA8866442200000),
    UInt128(0xB7732EEAA6621DD9, 0x95510CC884400000),
    UInt128(0x6EE65DD54CC43BB3, 0x2AA2199108800000),
    UInt128(0xDDCCBBAA99887766, 0x5544332211000000),
    UInt128(0xBB9977553310EECC, 0xAA88664422000000),
    UInt128(0x7732EEAA6621DD99, 0x5510CC8844000000),
    UInt128(0xEE65DD54CC43BB32, 0xAA21991088000000),
    UInt128(0xDCCBBAA998877665, 0x5443322110000000),
    UInt128(0xB9977553310EECCA, 0xA886644220000000),
    UInt128(0x732EEAA6621DD995, 0x510CC88440000000),
    UInt128(0xE65DD54CC43BB32A, 0xA219910880000000),
    UInt128(0xCCBBAA9988776655, 0x4433221100000000),
    UInt128(0x9977553310EECCAA, 0x8866442200000000),
    UInt128(0x32EEAA6621DD9955, 0x10CC884400000000),
    UInt128(0x65DD54CC43BB32AA, 0x2199108800000000),
    UInt128(0xCBBAA99887766554, 0x4332211000000000),
    UInt128(0x977553310EECCAA8, 0x8664422000000000),
    UInt128(0x2EEAA6621DD99551, 0xCC8844000000000),
    UInt128(0x5DD54CC43BB32AA2, 0x1991088000000000),
    UInt128(0xBBAA998877665544, 0x3322110000000000),
    UInt128(0x77553310EECCAA88, 0x6644220000000000),
    UInt128(0xEEAA6621DD995510, 0xCC88440000000000),
    UInt128(0xDD54CC43BB32AA21, 0x9910880000000000),
    UInt128(0xBAA9988776655443, 0x3221100000000000),
    UInt128(0x7553310EECCAA886, 0x6442200000000000),
    UInt128(0xEAA6621DD995510C, 0xC884400000000000),
    UInt128(0xD54CC43BB32AA219, 0x9108800000000000),
    UInt128(0xAA99887766554433, 0x2211000000000000),
    UInt128(0x553310EECCAA8866, 0x4422000000000000),
    UInt128(0xAA6621DD995510CC, 0x8844000000000000),
    UInt128(0x54CC43BB32AA2199, 0x1088000000000000),
    UInt128(0xA998877665544332, 0x2110000000000000),
    UInt128(0x53310EECCAA88664, 0x4220000000000000),
    UInt128(0xA6621DD995510CC8, 0x8440000000000000),
    UInt128(0x4CC43BB32AA21991, 0x880000000000000),
    UInt128(0x9988776655443322, 0x1100000000000000),
    UInt128(0x3310EECCAA886644, 0x2200000000000000),
    UInt128(0x6621DD995510CC88, 0x4400000000000000),
    UInt128(0xCC43BB32AA219910, 0x8800000000000000),
    UInt128(0x9887766554433221, 0x1000000000000000),
    UInt128(0x310EECCAA8866442, 0x2000000000000000),
    UInt128(0x621DD995510CC884, 0x4000000000000000),
    UInt128(0xC43BB32AA2199108, 0x8000000000000000),
    UInt128(0x8877665544332211, 0x0),
    UInt128(0x10EECCAA88664422, 0x0),
    UInt128(0x21DD995510CC8844, 0x0),
    UInt128(0x43BB32AA21991088, 0x0),
    UInt128(0x8776655443322110, 0x0),
    UInt128(0xEECCAA886644220 , 0x0),
    UInt128(0x1DD995510CC88440, 0x0),
    UInt128(0x3BB32AA219910880, 0x0),
    UInt128(0x7766554433221100, 0x0),
    UInt128(0xEECCAA8866442200, 0x0),
    UInt128(0xDD995510CC884400, 0x0),
    UInt128(0xBB32AA2199108800, 0x0),
    UInt128(0x7665544332211000, 0x0),
    UInt128(0xECCAA88664422000, 0x0),
    UInt128(0xD995510CC8844000, 0x0),
    UInt128(0xB32AA21991088000, 0x0),
    UInt128(0x6655443322110000, 0x0),
    UInt128(0xCCAA886644220000, 0x0),
    UInt128(0x995510CC88440000, 0x0),
    UInt128(0x32AA219910880000, 0x0),
    UInt128(0x6554433221100000, 0x0),
    UInt128(0xCAA8866442200000, 0x0),
    UInt128(0x95510CC884400000, 0x0),
    UInt128(0x2AA2199108800000, 0x0),
    UInt128(0x5544332211000000, 0x0),
    UInt128(0xAA88664422000000, 0x0),
    UInt128(0x5510CC8844000000, 0x0),
    UInt128(0xAA21991088000000, 0x0),
    UInt128(0x5443322110000000, 0x0),
    UInt128(0xA886644220000000, 0x0),
    UInt128(0x510CC88440000000, 0x0),
    UInt128(0xA219910880000000, 0x0),
    UInt128(0x4433221100000000, 0x0),
    UInt128(0x8866442200000000, 0x0),
    UInt128(0x10CC884400000000, 0x0),
    UInt128(0x2199108800000000, 0x0),
    UInt128(0x4332211000000000, 0x0),
    UInt128(0x8664422000000000, 0x0),
    UInt128(0xCC8844000000000 , 0x0),
    UInt128(0x1991088000000000, 0x0),
    UInt128(0x3322110000000000, 0x0),
    UInt128(0x6644220000000000, 0x0),
    UInt128(0xCC88440000000000, 0x0),
    UInt128(0x9910880000000000, 0x0),
    UInt128(0x3221100000000000, 0x0),
    UInt128(0x6442200000000000, 0x0),
    UInt128(0xC884400000000000, 0x0),
    UInt128(0x9108800000000000, 0x0),
    UInt128(0x2211000000000000, 0x0),
    UInt128(0x4422000000000000, 0x0),
    UInt128(0x8844000000000000, 0x0),
    UInt128(0x1088000000000000, 0x0),
    UInt128(0x2110000000000000, 0x0),
    UInt128(0x4220000000000000, 0x0),
    UInt128(0x8440000000000000, 0x0),
    UInt128(0x880000000000000 , 0x0),
    UInt128(0x1100000000000000, 0x0),
    UInt128(0x2200000000000000, 0x0),
    UInt128(0x4400000000000000, 0x0),
    UInt128(0x8800000000000000, 0x0),
    UInt128(0x1000000000000000, 0x0),
    UInt128(0x2000000000000000, 0x0),
    UInt128(0x4000000000000000, 0x0),
    UInt128(0x8000000000000000, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0)
};
// Right shift lookup table, from 0-127.
const UInt128 gRightShiftLut128[] = {
    UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
    UInt128(0x7FF76EE65DD54CC4, 0x3BB32AA219910880),
    UInt128(0x3FFBB7732EEAA662, 0x1DD995510CC88440),
    UInt128(0x1FFDDBB997755331, 0xEECCAA886644220),
    UInt128(0xFFEEDDCCBBAA998, 0x8776655443322110),
    UInt128(0x7FF76EE65DD54CC, 0x43BB32AA21991088),
    UInt128(0x3FFBB7732EEAA66, 0x21DD995510CC8844),
    UInt128(0x1FFDDBB99775533, 0x10EECCAA88664422),
    UInt128(0xFFEEDDCCBBAA99, 0x8877665544332211),
    UInt128(0x7FF76EE65DD54C, 0xC43BB32AA2199108),
    UInt128(0x3FFBB7732EEAA6, 0x621DD995510CC884),
    UInt128(0x1FFDDBB9977553, 0x310EECCAA8866442),
    UInt128(0xFFEEDDCCBBAA9, 0x9887766554433221),
    UInt128(0x7FF76EE65DD54, 0xCC43BB32AA219910),
    UInt128(0x3FFBB7732EEAA, 0x6621DD995510CC88),
    UInt128(0x1FFDDBB997755, 0x3310EECCAA886644),
    UInt128(0xFFEEDDCCBBAA, 0x9988776655443322),
    UInt128(0x7FF76EE65DD5, 0x4CC43BB32AA21991),
    UInt128(0x3FFBB7732EEA, 0xA6621DD995510CC8),
    UInt128(0x1FFDDBB99775, 0x53310EECCAA88664),
    UInt128(0xFFEEDDCCBBA, 0xA998877665544332),
    UInt128(0x7FF76EE65DD, 0x54CC43BB32AA2199),
    UInt128(0x3FFBB7732EE, 0xAA6621DD995510CC),
    UInt128(0x1FFDDBB9977, 0x553310EECCAA8866),
    UInt128(0xFFEEDDCCBB, 0xAA99887766554433),
    UInt128(0x7FF76EE65D, 0xD54CC43BB32AA219),
    UInt128(0x3FFBB7732E, 0xEAA6621DD995510C),
    UInt128(0x1FFDDBB997, 0x7553310EECCAA886),
    UInt128(0xFFEEDDCCB, 0xBAA9988776655443),
    UInt128(0x7FF76EE65, 0xDD54CC43BB32AA21),
    UInt128(0x3FFBB7732, 0xEEAA6621DD995510),
    UInt128(0x1FFDDBB99, 0x77553310EECCAA88),
    UInt128(0xFFEEDDCC, 0xBBAA998877665544),
    UInt128(0x7FF76EE6, 0x5DD54CC43BB32AA2),
    UInt128(0x3FFBB773, 0x2EEAA6621DD99551),
    UInt128(0x1FFDDBB9, 0x977553310EECCAA8),
    UInt128(0xFFEEDDC, 0xCBBAA99887766554),
    UInt128(0x7FF76EE, 0x65DD54CC43BB32AA),
    UInt128(0x3FFBB77, 0x32EEAA6621DD9955),
    UInt128(0x1FFDDBB, 0x9977553310EECCAA),
    UInt128(0xFFEEDD, 0xCCBBAA9988776655),
    UInt128(0x7FF76E, 0xE65DD54CC43BB32A),
    UInt128(0x3FFBB7, 0x732EEAA6621DD995),
    UInt128(0x1FFDDB, 0xB9977553310EECCA),
    UInt128(0xFFEED, 0xDCCBBAA998877665),
    UInt128(0x7FF76, 0xEE65DD54CC43BB32),
    UInt128(0x3FFBB, 0x7732EEAA6621DD99),
    UInt128(0x1FFDD, 0xBB9977553310EECC),
    UInt128(0xFFEE, 0xDDCCBBAA99887766),
    UInt128(0x7FF7, 0x6EE65DD54CC43BB3),
    UInt128(0x3FFB, 0xB7732EEAA6621DD9),
    UInt128(0x1FFD, 0xDBB9977553310EEC),
    UInt128(0xFFE, 0xEDDCCBBAA9988776),
    UInt128(0x7FF, 0x76EE65DD54CC43BB),
    UInt128(0x3FF, 0xBB7732EEAA6621DD),
    UInt128(0x1FF, 0xDDBB9977553310EE),
    UInt128(0xFF, 0xEEDDCCBBAA998877),
    UInt128(0x7F, 0xF76EE65DD54CC43B),
    UInt128(0x3F, 0xFBB7732EEAA6621D),
    UInt128(0x1F, 0xFDDBB9977553310E),
    UInt128(0xF, 0xFEEDDCCBBAA99887),
    UInt128(0x7, 0xFF76EE65DD54CC43),
    UInt128(0x3, 0xFFBB7732EEAA6621),
    UInt128(0x1, 0xFFDDBB9977553310),
    UInt128(0x0, 0xFFEEDDCCBBAA9988),
    UInt128(0x0, 0x7FF76EE65DD54CC4),
    UInt128(0x0, 0x3FFBB7732EEAA662),
    UInt128(0x0, 0x1FFDDBB997755331),
    UInt128(0x0, 0xFFEEDDCCBBAA998),
    UInt128(0x0, 0x7FF76EE65DD54CC),
    UInt128(0x0, 0x3FFBB7732EEAA66),
    UInt128(0x0, 0x1FFDDBB99775533),
    UInt128(0x0, 0xFFEEDDCCBBAA99),
    UInt128(0x0, 0x7FF76EE65DD54C),
    UInt128(0x0, 0x3FFBB7732EEAA6),
    UInt128(0x0, 0x1FFDDBB9977553),
    UInt128(0x0, 0xFFEEDDCCBBAA9),
    UInt128(0x0, 0x7FF76EE65DD54),
    UInt128(0x0, 0x3FFBB7732EEAA),
    UInt128(0x0, 0x1FFDDBB997755),
    UInt128(0x0, 0xFFEEDDCCBBAA),
    UInt128(0x0, 0x7FF76EE65DD5),
    UInt128(0x0, 0x3FFBB7732EEA),
    UInt128(0x0, 0x1FFDDBB99775),
    UInt128(0x0, 0xFFEEDDCCBBA),
    UInt128(0x0, 0x7FF76EE65DD),
    UInt128(0x0, 0x3FFBB7732EE),
    UInt128(0x0, 0x1FFDDBB9977),
    UInt128(0x0, 0xFFEEDDCCBB),
    UInt128(0x0, 0x7FF76EE65D),
    UInt128(0x0, 0x3FFBB7732E),
    UInt128(0x0, 0x1FFDDBB997),
    UInt128(0x0, 0xFFEEDDCCB),
    UInt128(0x0, 0x7FF76EE65),
    UInt128(0x0, 0x3FFBB7732),
    UInt128(0x0, 0x1FFDDBB99),
    UInt128(0x0, 0xFFEEDDCC),
    UInt128(0x0, 0x7FF76EE6),
    UInt128(0x0, 0x3FFBB773),
    UInt128(0x0, 0x1FFDDBB9),
    UInt128(0x0, 0xFFEEDDC),
    UInt128(0x0, 0x7FF76EE),
    UInt128(0x0, 0x3FFBB77),
    UInt128(0x0, 0x1FFDDBB),
    UInt128(0x0, 0xFFEEDD),
    UInt128(0x0, 0x7FF76E),
    UInt128(0x0, 0x3FFBB7),
    UInt128(0x0, 0x1FFDDB),
    UInt128(0x0, 0xFFEED),
    UInt128(0x0, 0x7FF76),
    UInt128(0x0, 0x3FFBB),
    UInt128(0x0, 0x1FFDD),
    UInt128(0x0, 0xFFEE),
    UInt128(0x0, 0x7FF7),
    UInt128(0x0, 0x3FFB),
    UInt128(0x0, 0x1FFD),
    UInt128(0x0, 0xFFE),
    UInt128(0x0, 0x7FF),
    UInt128(0x0, 0x3FF),
    UInt128(0x0, 0x1FF),
    UInt128(0x0, 0xFF),
    UInt128(0x0, 0x7F),
    UInt128(0x0, 0x3F),
    UInt128(0x0, 0x1F),
    UInt128(0x0, 0xF),
    UInt128(0x0, 0x7),
    UInt128(0x0, 0x3),
    UInt128(0x0, 0x1)
};

TEST_CASE("UInt128 left shift produces correct results.") {
    auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    for (auto i = 1; i <= 127; i++) {
        auto sample = base;
        sample <<= i;

        INFO("i = " << i);
        REQUIRE(sample == gLeftShiftLut128[i]);
    }
}

TEST_CASE("UInt128 right shift produces correct results.") {
    auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    for (auto i = 0; i <= 127; i++) {
        auto sample = base;
        sample >>= i;

        INFO("i = " << i);
        REQUIRE(sample == gRightShiftLut128[i]);
    }
}

2 ответа

Я не проверял это, но что-то вроде этого не имеет ответвлений:

inline
UInt128& UInt128::operator<<=(Int32 shift) noexcept
{
    auto lshift = shift & 31;
    auto rshift = 31 - lshift;
    UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
        0, 0, 0, 0,
        mLow << lshift,
        mLowMid << lshift | mLow >> 1 >> rshift,
        mHighMid << lshift | mLowMid >> 1 >> rshift,
        mHigh << lshift | mHighMid >> 1 >> rshift
#elif CPU_TYPE == CPU_BE32
        mHigh << lshift | mHighMid >> 1 >> rshift,
        mHighMid << lshift | mLowMid >> 1 >> rshift,
        mLowMid << lshift | mLow >> 1 >> rshift,
        mLow << lshift,
        0, 0, 0, 0
#endif
    };
    memcpy(this, &parts[
#if CPU_TYPE == CPU_LE32
        4 -
#endif
        (shift >> 5 & 3)], 16);
    return *this;
}

inline
UInt128& UInt128::operator>>=(Int32 shift) noexcept
{
    auto rshift = shift & 31;
    auto lshift = 31 - rshift;
    UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
    mLow >> rshift | mMidLow << lshift << 1,
    mMidLow >> rshift | mMidHigh << lshift << 1,
    mMidHigh >> rshift | mHigh << lshift << 1,
    mHigh >> rshift,
    0, 0, 0, 0
#elif CPU_TYPE == CPU_BE32
    0, 0, 0, 0,
    mHigh >> rshift,
    mMidHigh >> rshift | mHigh << lshift << 1,
    mMidLow >> rshift | mMidHigh << lshift << 1,
    mLow >> rshift | mMidLow << lshift << 1
#endif
    };
    memcpy(this, &parts[
#if CPU_TYPE == CPU_BE32
        4 -
#endif
        (shift >> 5 & 3)], 16);
    return *this;
}

Я бы подумал UInt128 было бы лучше реализовать с использованием массива, где порядок байтов не является проблемой, например,

alignas (16) uint32_t data[4]; или же: alignas (16) uint64_t data[2];

Обратите внимание, что выравнивание не будет гарантировано для объектов, созданных в куче; хотя некоторые ABI имеют минимальное выравнивание 16 байтов. Вы можете проверить с alignof(std::max_align_t), Если нет, вам нужно будет заменить глобального оператора новым и удалить функции для SIMD (например, SSE).

За uint32_t В реализации, вы разделяете сдвиг на сдвиги 'word' и 'bit' - также не имеет смысла иметь подписанный Int32 в качестве счетчика сдвигов...

inline UInt128 &
UInt128::operator <<= (uint32_t shift) noexcept
{
    shift &= 0x7f;
    auto shw = shift / (32); // or (shift >> 5)
    auto shl = shift % (32); // or (shift & 1f)

    // branch-free shift masking:

    uint32_t shm = shl - 1;
    uint32_t shr = (- shl) & (32 - 1);
    shm = (shm >> (32 - 1)) - 1; // 0xffffffff or 0x0

    switch (shw)
    {
    case (3) :
        data[3] = (data[0] << shl);
        data[2] = 0, data[1] = 0, data[0] = 0;
        break;

    case (2) :
        data[3] = (data[1] << shl) | ((data[0] >> shr) & shm);
        data[2] = (data[0] << shl);
        data[1] = 0, data[0] = 0;
        break;

    case (1) :
        data[3] = (data[2] << shl) | ((data[1] >> shr) & shm);
        data[2] = (data[1] << shl) | ((data[0] >> shr) & shm);
        data[1] = (data[0] << shl);
        data[0] = 0;
        break;

    case (0) : // default:
        data[3] = (data[3] << shl) | ((data[2] >> shr) & shm);
        data[2] = (data[2] << shl) | ((data[1] >> shr) & shm);
        data[1] = (data[1] << shl) | ((data[0] >> shr) & shm);
        data[0] = (data[0] << shl);
        // break;
    }

    return *this;
}

Я почти уверен, что у меня правильные индексы данных. Если shift является постоянной времени компиляции, компилятор должен быть в состоянии довольно агрессивно оптимизировать этот код.

Я оставлю правую смену вам, кроме вас, чтобы обновить data от низких до высоких слов, чтобы не перезаписывать слова до того, как они будут прочитаны. В противном случае это должно быть простое упражнение в обмене ролями shl а также shr, uint64_t Версия данных должна быть довольно простой.

Другие вопросы по тегам