c++ – Why is the custom allocator significantly faster after adding a line of “memset”?

I am writing a “freelist allocator”. I started with a line of memset in the initialization phase for debugging purposes. Later I removed the memset. But I found that the allocation part in the testing function test1_managed is running 3-4 times faster with the memset in the initialization than the one without. I am confused with the result. What reason could it be?

I compiled the code with Release mode of MSVC 19.29.30140 with “/O2” optimization enabled.

Here is the code:

#pragma once
#include <cstdio>
#include <iostream>
#include <chrono>

using namespace std;
using namespace chrono;

const uint64_t MemSize = 1 << 20;
const uint64_t TestSize = 1 << 12;
typedef int TestType;

struct MemSegInfo {
    MemSegInfo* m_next{ 0 };
    MemSegInfo* m_nextFree{ 0 };
    uint64_t m_size{ 0 };
    uint64_t m_handle{ 0 };
};
const uint64_t infoSize = sizeof(MemSegInfo);
const uint64_t ptrSize = sizeof(MemSegInfo*);
const uint64_t maxAlignment = 16;

char* m_dataBuffer{ 0 };
char* m_dataBufferEnd{ 0 };
MemSegInfo* m_head;

MemSegInfo m_segFree;
MemSegInfo* m_segFreeCursor;

inline void* OffsetFromMemSegInfo(MemSegInfo* seg) {
    return ((char*)seg) + infoSize;
}
inline MemSegInfo* OffsetToMemSegInfo(void* ptr) {
    return (MemSegInfo*)(((char*)ptr) - infoSize);
}

template<class T>
inline void SetMem(MemSegInfo* seg, T&& val) {
    *((T*)OffsetFromMemSegInfo(seg)) = val;
}
template<class T>
inline void SetMem(MemSegInfo* seg, const T& val) {
    *((T*)OffsetFromMemSegInfo(seg)) = val;
}
template<class T>
inline T GetMem(MemSegInfo* seg) {
    return *((T*)OffsetFromMemSegInfo(seg));
}

void Init() {
    m_dataBuffer = (char*)malloc(MemSize);
    if (!m_dataBuffer)
        throw "Out of mem! ";
    m_dataBufferEnd = m_dataBuffer + MemSize;

    m_head = (MemSegInfo*)m_dataBuffer;
    m_head->m_next = m_head;
    //m_head->m_last = m_head;
    m_head->m_size = MemSize - infoSize;
    m_head->m_nextFree = &m_segFree;

    m_segFree.m_size = 0;
    m_segFree.m_nextFree = m_head;

    m_segFreeCursor = &m_segFree;

    /***THE MEMSET I AM TALKING ABOUT***/
    memset(OffsetFromMemSegInfo(m_head), 0xfafafafa, m_head->m_size);
}

MemSegInfo* Allocate(size_t size) {
    const uint64_t sizeAligned = ((size - 1) / maxAlignment + 1) * maxAlignment;
    const uint64_t sizeAlloc = sizeAligned + infoSize;
    MemSegInfo* segCurPrev = m_segFreeCursor;
    while (segCurPrev->m_nextFree->m_size < sizeAlloc) { // Go through freelist to find the First Fit. 
        segCurPrev = segCurPrev->m_nextFree;
        if (segCurPrev == m_segFreeCursor)
            throw "Out of mem! ";
    }
    MemSegInfo* const segCur = segCurPrev->m_nextFree;
    const uint64_t sizeRest = segCur->m_size - sizeAligned;
    if (sizeRest <= infoSize) { // There is not enough space for another seg. Just gonna use it. 
        segCurPrev->m_nextFree = segCur->m_nextFree; // This can also deal with the case where this is the last one available. 
        m_segFreeCursor = segCurPrev;
    }
    else { // There is enough space for another seg. Separate and make a new seg. 
        MemSegInfo* const segNew = (MemSegInfo*)(((char*)segCur) + sizeAlloc);
        MemSegInfo* const segNext = segCur->m_next;
        MemSegInfo* const segNextFree = segCur->m_nextFree;

        // Rearrange seg list
        //segNew->m_last = segCur;
        //segNext->m_last = segNew;
        segNew->m_next = segNext;
        segCur->m_next = segNew;
        segNew->m_size = sizeRest - infoSize;
        segCur->m_size = sizeAligned;
        segCur->m_nextFree = nullptr;

        // Join freelist
        segNew->m_nextFree = segNextFree;
        segCurPrev->m_nextFree = segNew;
        m_segFreeCursor = segCurPrev;
    }
    return segCur;
}

inline void Free(MemSegInfo* segFree) {
    // Join the freelist
    segFree->m_nextFree = m_segFree.m_nextFree;
    m_segFree.m_nextFree = segFree->m_nextFree;
}

void Cleanup() {
    free(m_dataBuffer);
}

MemSegInfo* mems[16];

struct MyStruct
{
    uint64_t m_a;
    uint64_t m_b;
    uint32_t m_c;
    uint32_t m_d;
};

void test0() {
    Init();

    mems[0] = Allocate(8);
    SetMem<uint64_t>(mems[0], 128);
    uint64_t t0 = GetMem<uint64_t>(mems[0]);
    mems[1] = Allocate(4);
    uint64_t t1 = GetMem<uint64_t>(mems[0]);
    SetMem<uint32_t>(mems[1], 256);
    uint64_t t2 = GetMem<uint64_t>(mems[0]);
    mems[2] = Allocate(64);
    int* a = (int*)OffsetFromMemSegInfo(mems[2]);
    for (int i = 0; i < 16; ++i) {
        a[i] = i;
    }
    mems[3] = Allocate(sizeof(MyStruct) * 8);
    MyStruct* b = (MyStruct*)OffsetFromMemSegInfo(mems[3]);
    for (int i = 0; i < 8; ++i) {
        b[i] = { (uint64_t)i, (uint64_t)i * 2, (uint32_t)i * 3, (uint32_t)i * 4 };
    }

    for (int i = 0; i < 16; ++i) {
        printf("%d ", a[i]);
    }
    printf("n");

    Free(mems[1]);
    printf("%lldn", GetMem<uint64_t>(mems[0]));
    for (int i = 0; i < 16; ++i) {
        printf("%d ", a[i]);
    }
    printf("n");
    for (int i = 0; i < 8; ++i) {
        printf("%lld %lld %ld %ldt", b[i].m_a, b[i].m_b, b[i].m_c, b[i].m_d);
    }

    Cleanup();
}


TestType* bufferManaged[TestSize];
void test1_managed() {
    Init();
    
    /***ALLOCATION START***/
    system_clock::time_point beg_alloc = system_clock::now();
    TestType sum = 0;
    for (uint64_t i = 0; i < TestSize; ++i) {
        TestType val = i % INT32_MAX;
        bufferManaged[i] = new (OffsetFromMemSegInfo(Allocate(sizeof(TestType)))) TestType(val);
        sum += *bufferManaged[i];
    }
    /***ALLOCATION END***/
    std::atomic_signal_fence(std::memory_order_seq_cst);
    system_clock::time_point end_alloc = system_clock::now();
    printf("managed %llu nsn", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_alloc - beg_alloc)).count());
    printf("sum %dn", sum);

    system_clock::time_point beg_free = system_clock::now();
    for (uint64_t i = 0; i < TestSize; ++i) {
        Free((MemSegInfo*)OffsetToMemSegInfo(bufferManaged[i]));
    }
    Cleanup();
    std::atomic_signal_fence(std::memory_order_seq_cst);
    system_clock::time_point end_free = system_clock::now();
    printf("managedfree %llu nsnn", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_free - beg_free)).count());
}

TestType* bufferUnmanaged[TestSize];
void test1_unmnged() {
    system_clock::time_point beg_alloc = system_clock::now();
    TestType sum = 0;
    for (uint64_t i = 0; i < TestSize; ++i) {
        TestType val = i % INT32_MAX;
        bufferUnmanaged[i] = new TestType(val);
        sum += *bufferUnmanaged[i];
    }
    std::atomic_signal_fence(std::memory_order_seq_cst);
    system_clock::time_point end_alloc = system_clock::now();
    printf("unmnged %llu nsn", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_alloc - beg_alloc)).count());
    printf("sum %dn", sum);

    system_clock::time_point beg_free = system_clock::now();
    for (uint64_t i = 0; i < TestSize; ++i) {
        delete bufferUnmanaged[i];
    }
    std::atomic_signal_fence(std::memory_order_seq_cst);
    system_clock::time_point end_free = system_clock::now();
    printf("unmngedfree %llu nsnn", (std::chrono::duration_cast<std::chrono::nanoseconds>(end_free - beg_free)).count());
}

void test1() {
    std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
    auto epoch = now.time_since_epoch();
    auto value = std::chrono::duration_cast<std::chrono::milliseconds>(epoch);
    long duration = value.count();
    srand(duration);

    for (int i = 0; i < 10; i++) {
        printf("test%dn", i);
        test1_managed();
        test1_unmnged();
        printf("n");
    }
}

void test() {
    test1();
}

Leave a Comment