https://github.com/BonzaiThePenguin/WikiSort
<http://unlicense.org>
#include "../SortAlgo.h"
#include <algorithm>
#include <vector>
#include <cassert>
#include <cstring>
namespace WikiSortNS {
template <typename IteratorType>
class RangeI {
public:
    typedef IteratorType iterator;
    typedef typename std::iterator_traits<iterator>::value_type value_type;
    iterator start;
    iterator end;
    RangeI() {}
    RangeI(iterator start, iterator end) : start(start), end(end) {}
    inline ssize_t length() const { return end - start; }
};
size_t FloorPowerOfTwo (const size_t value) {
    size_t x = value;
    x = x | (x >> 1);
    x = x | (x >> 2);
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x | (x >> 16);
#if __LP64__
    x = x | (x >> 32);
#endif
    return x - (x >> 1);
}
template <typename Iterator, typename Comparison>
void InsertionSort(Iterator begin, Iterator end, const Comparison compare) {
    std::__insertion_sort(begin, end, compare);
}
template <typename Iterator>
void BlockSwap(Iterator start1, Iterator start2, const size_t block_size) {
    std::swap_ranges(start1, start1 + block_size, start2);
}
template <typename Iterator>
void Rotate(Iterator begin, Iterator end, const ssize_t amount,
            typename std::iterator_traits<Iterator>::value_type* cache, const size_t cache_size)
{
    if (begin >= end) return;
    Iterator split;
    if (amount >= 0) split = begin + amount;
    else split = end + amount;
    size_t r1 = split - begin;
    size_t r2 = end - split;
    if (0) { 
        
        if (r1 <= r2) {
            if (r1 <= cache_size) {
                std::copy(begin, split, cache);
                std::copy(split, end, begin);
                std::copy(cache, cache + r1, begin + r2);
                return;
            }
        } else {
            if (r2 <= cache_size) {
                std::copy(split, end, cache);
                std::copy_backward(begin, split, end);
                std::copy(cache, cache + r2, begin);
                return;
            }
        }
    }
    std::rotate(begin, split, end);
}
template <typename Iterator, typename Comparison>
void Merge(const RangeI<Iterator>& buffer, const RangeI<Iterator>& A, const RangeI<Iterator>& B,
           const Comparison compare, typename std::iterator_traits<Iterator>::value_type* cache, const size_t cache_size)
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;
    
    if (A.length() <= cache_size) {
        value_type *A_index = cache, *A_last = cache + A.length();
        Iterator B_index = B.start, B_last = B.end;
        Iterator insert_index = A.start;
        if (B.length() > 0 && A.length() > 0) {
            while (true) {
                if (!compare(*B_index, *A_index)) {
                    *insert_index = *A_index;
                    A_index++;
                    insert_index++;
                    if (A_index == A_last) break;
                } else {
                    *insert_index = *B_index;
                    B_index++;
                    insert_index++;
                    if (B_index == B_last) break;
                }
            }
        }
        
        std::copy(A_index, A_last, insert_index);
    } else {
        
        
        Iterator A_index = buffer.start, B_index = B.start;
        Iterator A_last = buffer.start + A.length(), B_last = B.end;
        Iterator insert_index = A.start;
        if (B.length() > 0 && A.length() > 0) {
            while (true) {
                if (!compare(*B_index, *A_index)) {
                    std::swap(*insert_index, *A_index);
                    A_index++;
                    insert_index++;
                    if (A_index == A_last) break;
                } else {
                    std::swap(*insert_index, *B_index);
                    B_index++;
                    insert_index++;
                    if (B_index == B_last) break;
                }
            }
        }
        std::swap_ranges(A_index, A_last, insert_index);
    }
}
template <typename Iterator, typename Comparison>
void Sort(Iterator first, Iterator last, const Comparison compare)
{
    
    
    const size_t size = last - first;
    typedef typename std::iterator_traits<Iterator>::value_type value_type;
    typedef RangeI<Iterator> Range;
    
    if (0 && size <= 32) {
        InsertionSort(first, last, compare);
        return;
    }
    
    
    
    
    
    
    
    const size_t cache_size = 8;
    value_type cache[cache_size];
    
    
    const size_t base_size = 8;
    const size_t power_of_two = FloorPowerOfTwo(size);
    const size_t fractional_base = power_of_two / base_size;
    size_t fractional_step = size % fractional_base;
    size_t decimal_step = size / fractional_base;
    
    Iterator start = first, mid, end = first;
    size_t fractional = 0;
    while (end < last) {
        end += decimal_step;
        fractional += fractional_step;
        if (fractional >= fractional_base) {
            fractional -= fractional_base;
            end++;
        }
        InsertionSort(start, end, compare);
        start = end;
    }
    
    for (size_t merge_size = base_size; merge_size < power_of_two; merge_size += merge_size) {
        size_t block_size = sqrt(decimal_step);
        size_t buffer_size = decimal_step / block_size + 1;
        
        
        Range level1 = Range(first, first), level2, levelA, levelB;
        size_t decimal = fractional = 0;
        while (decimal < size) {
            start = first + decimal;
            decimal += decimal_step;
            fractional += fractional_step;
            if (fractional >= fractional_base) {
                fractional -= fractional_base;
                decimal++;
            }
            mid = first + decimal;
            decimal += decimal_step;
            fractional += fractional_step;
            if (fractional >= fractional_base) {
                fractional -= fractional_base;
                decimal++;
            }
            end = first + decimal;
            if (compare(*(end - 1), *start)) {
                
                Rotate(start, end, mid - start, cache, cache_size);
            } else if (compare(*mid, *(mid - 1))) {
                
                Range A = Range(start, mid), B = Range(mid, end);
                if (A.length() <= cache_size) {
                    std::copy(A.start, A.end, cache);
                    Merge(Range(first, first), A, B, compare, cache, cache_size);
                    continue;
                }
                
                Range bufferA, bufferB, buffer1, buffer2;
                if (level1.length() > 0) {
                    
                    bufferA = Range(A.start, A.start);
                    bufferB = Range(B.end, B.end);
                    buffer1 = level1;
                    buffer2 = level2;
                } else {
                    
                    size_t count = 1;
                    for (buffer1.start = A.start + 1; buffer1.start < A.end; buffer1.start++)
                        if (compare(*(buffer1.start - 1), *buffer1.start) || compare(*buffer1.start, *(buffer1.start - 1)))
                            if (++count == buffer_size)
                                break;
                    buffer1.end = buffer1.start + count;
                    
                    
                    
                    if (buffer_size <= cache_size) {
                        buffer2 = Range(A.start, A.start);
                        if (buffer1.length() == buffer_size) {
                            
                            bufferA = Range(buffer1.start, buffer1.start + buffer_size);
                            bufferB = Range(B.end, B.end);
                            buffer1 = Range(A.start, A.start + buffer_size);
                        } else {
                            
                            bufferA = Range(buffer1.start, buffer1.start);
                            buffer1 = Range(A.start, A.start);
                            
                            count = 1;
                            for (buffer1.start = B.end - 2; buffer1.start >= B.start; buffer1.start--)
                                if (compare(*buffer1.start, *(buffer1.start + 1)) || compare(*(buffer1.start + 1), *buffer1.start))
                                    if (++count == buffer_size)
                                        break;
                            buffer1.end = buffer1.start + count;
                            if (buffer1.length() == buffer_size) {
                                bufferB = Range(buffer1.start, buffer1.start + buffer_size);
                                buffer1 = Range(B.end - buffer_size, B.end);
                            }
                        }
                    } else {
                        
                        count = 0;
                        for (buffer2.start = buffer1.start + 1; buffer2.start < A.end; buffer2.start++)
                            if (compare(*(buffer2.start - 1), *buffer2.start) || compare(*buffer2.start, *(buffer2.start - 1)))
                                if (++count == buffer_size)
                                    break;
                        buffer2.end = buffer2.start + count;
                        if (buffer2.length() == buffer_size) {
                            
                            bufferA = Range(buffer2.start, buffer2.start + buffer_size * 2);
                            bufferB = Range(B.end, B.end);
                            buffer1 = Range(A.start, A.start + buffer_size);
                            buffer2 = Range(A.start + buffer_size, A.start + buffer_size * 2);
                        } else if (buffer1.length() == buffer_size) {
                            
                            bufferA = Range(buffer1.start, buffer1.start + buffer_size);
                            buffer1 = Range(A.start, A.start + buffer_size);
                            
                            count = 1;
                            for (buffer2.start = B.end - 2; buffer2.start >= B.start; buffer2.start--)
                                if (compare(*buffer2.start, *(buffer2.start + 1)) || compare(*(buffer2.start + 1), *buffer2.start))
                                    if (++count == buffer_size)
                                        break;
                            buffer2.end = buffer2.start + count;
                            if (buffer2.length() == buffer_size) {
                                bufferB = Range(buffer2.start, buffer2.start + buffer_size);
                                buffer2 = Range(B.end - buffer_size, B.end);
                            } else buffer1.end = buffer1.start; 
                        } else {
                            
                            count = 1;
                            for (buffer1.start = B.end - 2; buffer1.start >= B.start; buffer1.start--)
                                if (compare(*buffer1.start, *(buffer1.start + 1)) || compare(*(buffer1.start + 1), *buffer1.start))
                                    if (++count == buffer_size)
                                        break;
                            buffer1.end = buffer1.start + count;
                            count = 0;
                            for (buffer2.start = buffer1.start - 1; buffer2.start >= B.start; buffer2.start--)
                                if (compare(*buffer2.start, *(buffer2.start + 1)) || compare(*(buffer2.start + 1), *buffer2.start))
                                    if (++count == buffer_size)
                                        break;
                            buffer2.end = buffer2.start + count;
                            if (buffer2.length() == buffer_size) {
                                bufferA = Range(A.start, A.start);
                                bufferB = Range(buffer2.start, buffer2.start + buffer_size * 2);
                                buffer1 = Range(B.end - buffer_size, B.end);
                                buffer2 = Range(buffer1.start - buffer_size, buffer1.start);
                            } else buffer1.end = buffer1.start; 
                        }
                    }
                    if (buffer1.length() < buffer_size) {
                        
                        
                        while (A.length() > 0 && B.length() > 0) {
                            
                            Iterator mid = std::lower_bound(B.start, B.end, *A.start, compare);
                            
                            ssize_t amount = mid - A.end;
                            Rotate(A.start, mid, -amount, cache, cache_size);
                            
                            B.start = mid;
                            A = Range(std::upper_bound(A.start, A.end, *(A.start + amount), compare), B.start);
                        }
                        continue;
                    }
                    
                    size_t length = bufferA.length(); count = 0;
                    for (Iterator index = bufferA.start; count < length; index--) {
                        if (index == A.start || compare(*(index - 1), *index) || compare(*index, *(index - 1))) {
                            Rotate(index + 1, bufferA.start + 1, -count, cache, cache_size);
                            bufferA.start = index + count; count++;
                        }
                    }
                    bufferA = Range(A.start, A.start + length);
                    
                    length = bufferB.length(); count = 0;
                    for (Iterator index = bufferB.start; count < length; index++) {
                        if (index == B.end - 1 || compare(*index, *(index + 1)) || compare(*(index + 1), *index)) {
                            Rotate(bufferB.start, index, count, cache, cache_size);
                            bufferB.start = index - count; count++;
                        }
                    }
                    bufferB = Range(B.end - length, B.end);
                    
                    level1 = buffer1;
                    level2 = buffer2;
                    levelA = bufferA;
                    levelB = bufferB;
                }
                
                Range blockA = Range(bufferA.end, A.end);
                Range firstA = Range(bufferA.end, bufferA.end + blockA.length() % block_size);
                
                for (Iterator index = buffer1.start, indexA = firstA.end + 1; indexA < blockA.end; index++, indexA += block_size)
                    std::swap(*index, *indexA);
                
                
                Range lastA = firstA;
                Range lastB = Range(first, first);
                Range blockB = Range(B.start, B.start + std::min<size_t>(block_size, B.length() - bufferB.length()));
                blockA.start += firstA.length();
                Iterator minA = blockA.start, indexA = buffer1.start;
                value_type min_value = *minA;
                if (lastA.length() <= cache_size)
                    std::copy(lastA.start, lastA.end, cache);
                else
                    std::swap_ranges(lastA.start, lastA.end, buffer2.start);
                while (true) {
                    
                    
                    if ((lastB.length() > 0 && !compare(*(lastB.end - 1), min_value)) || blockB.length() == 0) {
                        
                        Iterator B_split = std::lower_bound(lastB.start, lastB.end, min_value, compare);
                        size_t B_remaining = lastB.end - B_split;
                        
                        BlockSwap(blockA.start, minA, block_size);
                        
                        
                        std::swap(*(blockA.start + 1), *(indexA++));
                        
                        Merge(buffer2, lastA, Range(lastA.end, B_split), compare, cache, cache_size);
                        
                        if (block_size <= cache_size)
                            std::copy(blockA.start, blockA.start + block_size, cache);
                        else
                            BlockSwap(blockA.start, buffer2.start, block_size);
                        
                        
                        
                        BlockSwap(B_split, blockA.start + block_size - B_remaining, B_remaining);
                        
                        lastA = Range(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
                        lastB = Range(lastA.end, lastA.end + B_remaining);
                        blockA.start += block_size;
                        if (blockA.length() == 0)
                            break;
                        
                        minA = blockA.start + 1;
                        for (Iterator findA = minA + block_size; findA < blockA.end; findA += block_size)
                            if (compare(*findA, *minA)) minA = findA;
                        minA = minA - 1; 
                        min_value = *minA;
                    } else if (blockB.length() < block_size) {
                        
                        
                        Rotate(blockA.start, blockB.end, -blockB.length(), cache, 0);
                        lastB = Range(blockA.start, blockA.start + blockB.length());
                        blockA.start += blockB.length();
                        blockA.end += blockB.length();
                        minA += blockB.length();
                        blockB.end = blockB.start;
                    } else {
                        
                        BlockSwap(blockA.start, blockB.start, block_size);
                        lastB = Range(blockA.start, blockA.start + block_size);
                        if (minA == blockA.start)
                            minA = blockA.end;
                        blockA.start += block_size;
                        blockA.end += block_size;
                        blockB.start += block_size;
                        blockB.end += block_size;
                        if (blockB.end > bufferB.start)
                            blockB.end = bufferB.start;
                    }
                }
                
                Merge(buffer2, lastA, Range(lastA.end, B.end - bufferB.length()), compare, cache, cache_size);
            }
        }
        if (level1.length() > 0) {
            
            
            InsertionSort(level2.start, level2.end, compare);
            
            Iterator level_start = levelA.start;
            for (Iterator index = levelA.end; levelA.length() > 0; index++) {
                if (index == levelB.start || !compare(*index, *levelA.start)) {
                    ssize_t amount = index - levelA.end;
                    Rotate(levelA.start, index, -amount, cache, cache_size);
                    levelA.start += amount + 1;
                    levelA.end += amount;
                    index--;
                }
            }
            
            for (Iterator index = levelB.start; levelB.length() > 0; index--) {
                if (index == level_start || !compare(*(levelB.end - 1), *(index - 1))) {
                    ssize_t amount = levelB.start - index;
                    Rotate(index, levelB.end, amount, cache, cache_size);
                    levelB.start -= amount;
                    levelB.end -= amount + 1;
                    index++;
                }
            }
        }
        decimal_step += decimal_step;
        fractional_step += fractional_step;
        if (fractional_step >= fractional_base) {
            fractional_step -= fractional_base;
            decimal_step++;
        }
    }
}
} 
struct ColoringComparator
{
    WSortView& m_array;
    const ArrayItem *m_begin, *m_end;
    ColoringComparator(WSortView& A)
        : m_array(A),
          m_begin( &(A.direct(0)) ),
          m_end( &(A.direct(A.size()-1)) )
    { }
    void touch(const ArrayItem* x) const
    {
        if (x < m_begin || x > m_end) return;
        unsigned int index = x - m_begin;
        m_array.touch(index, 16, 1, 10);
    }
    bool operator()(const ArrayItem& a, const ArrayItem& b) const
    {
        touch(&a);
        touch(&b);
        return a < b;
    }
};
void WikiSort(WSortView& A)
{
    ColoringComparator cmp(A);
    WikiSortNS::Sort(MyIterator(&A,0), MyIterator(&A,A.size()), cmp);
}