2

I try to implement a simple coroutine using c.

The platform is:

  • M3 Pro MacBook Pro 16
  • apple native gcc
  • macOS 14.3.1

Here is my code:

// main.c

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define STACK_SIZE 1024

typedef struct {
    u_int64_t x0;
    u_int64_t sp;
    u_int64_t x29;
    u_int64_t x30;
   
    u_int64_t endAddr;
    u_int64_t endSp;
    u_int64_t endX29;
    u_int64_t endX30;

    uint64_t entry;
    uint64_t stack;
    size_t size;
    volatile int dead;
    volatile int start;
} Routine;

typedef void (*Entry)();

volatile Routine* current;
Routine queue[2];

void switchToRoutine();

void markCurrentDead() {
    current->dead = 1;
};

void wrapper(Entry entry) {
    entry();
    markCurrentDead();
    switchToRoutine();
}

void createRoutine(Entry entry) {
    u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }
    Routine r;
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF;
    r.sp = aligned_sp;
    r.x29 = aligned_sp;
    r.entry = (uint64_t)wrapper;
    r.x30 = (uint64_t)wrapper;
    r.size = STACK_SIZE - 1;
    r.dead = 0;
    r.stack = (uint64_t)stack;
    r.x0 = (uint64_t)entry;
    r.start = 0;

    queue[1] = r;
}

void createMainRoutine(Entry entry) {
    u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }
    Routine r;
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF;
    r.sp = aligned_sp;
    r.x29 = aligned_sp;
    r.entry = (uint64_t)entry;
    r.x30 = (uint64_t)entry;
    r.size = STACK_SIZE - 1;
    r.dead = 0;
    r.stack = (uint64_t)stack;
    r.x0 = (uint64_t)entry;

    queue[0] = r;
}

void switchToChildRoutine(Routine* routine) {
    __asm__ volatile(
        "add %0, sp, #32\n\t"
        "mov %1, x29\n\t"
        "mov %2, x30\n\t"
        : "=r"(current->sp), "=r"(current->x29), "=r"(current->x30)
        :
        :"memory"
    );
    current = routine;
    if (current->start == 0) {
        current->start = 1;
        __asm__ volatile(
            "ldr x10, [%0]\n\t"
            "mov sp, x10\n\t"
            "ldr x29, [%1]\n\t"
            "ldr x30, [%2]\n\t"
            "ldr x0, [%3]\n\t"
            "ret\n\t"
            :
            :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30), "r"(&current->x0)
            :"x10", "x30", "memory", "x0"
        );
    } else {
        __asm__ volatile(
            "ldr x10, [%0]\n\t"
            "mov sp, x10\n\t"
            "ldr x29, [%1]\n\t"
            "ldr x30, [%2]\n\t"
            "ret\n\t"
            :
            :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30)
            :"x10", "x30", "memory"
        );
    }
}

void switchToRoutine() {
    __asm__ volatile(
        "add x1, sp, #16\n\t"
        "str x1, [%0]\n\t"
        "ldr x1, [sp]\n\t"
        "str x1, [%1]\n\t"
        "ldr x1, [sp, #8]\n\t"
        "str x1, [%2]\n\t"
        : 
        :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30)
        :"memory", "x1"
    );
    current = &queue[0];
    __asm__ volatile(
        "ldr x10, [%0]\n\t"
        "mov sp, x10\n\t"
        "ldr x29, [%1]\n\t"
        "ldr x30, [%2]\n\t"
        "ret\n\t"
        :
        :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30)
        :"x10", "x30", "memory"
    );
}

void hello() {
    printf("hello\n");
    //switchToRoutine();
    printf("world\n");
    printf("111\n");
}

Routine* queueZero() {
    return &queue[0];
}

Routine* queueOne() {
    return &queue[1];
}

void mainRoutineEntry() {
    while(1) {
        printf("enter main routine\n");
        Routine* b = queueOne();
        if (b->dead == 1) {
            printf("yes\n");
            Routine* a = queueZero();
            __asm__ volatile(
                "mov sp, %0\n\t"
                "mov x29, %1\n\t"
                "mov x30, %2\n\t"
                "ret\n\t"
                :
                :"r"(a->endSp), "r"(a->endX29), "r"(a->endX30)
            );
        } else {
            switchToChildRoutine(b);
        }
    }
};

void execute() {
    volatile uint64_t x30 = 0;
    __asm__ volatile(
        "mov %0, x30\n\t"
        :"=r"(x30)
        :
    );
    current = &queue[0];
    current->endX30 = x30;
    __asm__ volatile(
        "add x2, sp, #32\n\t"
        "str x2, [%0]\n\t"
        "str x29, [%1]\n\t"
        "ldr x2, [%3]\n\t"
        "mov sp, x2\n\t"
        "ldr x29, [%4]\n\t"
        "ldr x30, [%5]\n\t"
        "ret\n\t"
        :
        :"r"(&queue[0].endSp), "r"(&queue[0].endX29), "r"(&queue[0].endX30),"r"(&queue[0].sp), "r"(&queue[0].x29), "r"(&queue[0].x30)
        :"x2", "x30", "memory"
    );
}

void release() {
    free((uint8_t*)(queue[0].stack));
    free((uint8_t*)(queue[1].stack));
}

int main() {
    createRoutine(hello);

    createMainRoutine(mainRoutineEntry);
    execute();
    
    // release();

    free((uint8_t*)(queue[0].stack));
    free((uint8_t*)(queue[1].stack));
    
    printf("ok, that's right\n");
    printf("wow\n");

    return 0;
}

Compile and run:

gcc main.c -o main
./main

Run gcc -v, here is the output:

Apple clang version 15.0.0 (clang-1500.1.0.2.5)
Target: arm64-apple-darwin23.3.0
Thread model: posix
InstalledDir: /Library/Developer/CommandLineTools/usr/bin

I run the program many times, sometimes it's right, sometimes it's wrong, and I get an error:

demo(7527,0x1da301c40) malloc: Region cookie corrupted for region 0x159800000 (value is 0)[0x1598081fc]
demo(7527,0x1da301c40) malloc: *** set a breakpoint in malloc_error_break to debug
[1]    7527 abort      ./demo

I don't know what it is going on like this.

If you refactor main function like this:

int main() {
    createRoutine(hello);

    createMainRoutine(mainRoutineEntry);
    execute();
    
    release();

    // free((uint8_t*)(queue[0].stack));
    // free((uint8_t*)(queue[1].stack));
    
    printf("ok, that's right\n");
    printf("wow\n");

    return 0;
}

It's absolutely failed to run.

I guess that there's something wrong in execute function, I check the code line by line, especially these inline assemble code. It looks all right. x29, sp, x30 registers are loaded or stored in a good manner.On the other hand, I think it might be caused by global variable current and queue.After I refactor them in a routine stack memory, and move to another routine stack memory, the code might not read the newest value of these global variables.Unfortunately, I am not excellent in assemble code, I cannot prove that.

I hope I can run it successfully, and figure out the reason behind the phenomenon.


Thank PeterCordes, I refactor my code and everything is ok, here is the new code:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define STACK_SIZE 1024

typedef struct {
    u_int64_t x0;
    u_int64_t sp;
    u_int64_t x29;
    u_int64_t x30;
    u_int64_t endAddr;
    u_int64_t endSp;
    u_int64_t endX29;
    u_int64_t endX30;

    uint64_t entry;
    uint64_t stack;
    size_t size;
    volatile int dead;
    volatile int start;
} Routine;

Routine* queueZero();
Routine* queueOne();

typedef void (*Entry)();

Routine* current;
Routine queue[2];

void switchToRoutine();

void markCurrentDead() {
    current->dead = 1;
};

void wrapper(Entry entry) {
    entry();
    markCurrentDead();
    printf("ehy\n");
    switchToRoutine();
}

 
void createRoutine(Entry entry) {
    u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }
    Routine r;
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF; 
    r.sp = aligned_sp;
    r.x29 = aligned_sp;
    r.entry = (uint64_t)wrapper;
    r.x30 = (uint64_t)wrapper;
    r.size = STACK_SIZE - 1;
    r.dead = 0;
    r.stack = (uint64_t)stack;
    r.x0 = (uint64_t)entry;
    r.start = 0;

    queue[1] = r;
}

void execute();
void release();
void mainRoutineEntry();

void createMainRoutine(Entry entry) {
    u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }
    Routine r;
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF; 
    r.sp = aligned_sp;
    r.x29 = aligned_sp;
    r.entry = (uint64_t)entry;
    r.x30 = (uint64_t)entry;
    r.size = STACK_SIZE - 1;
    r.dead = 0;
    r.stack = (uint64_t)stack;
    r.x0 = (uint64_t)entry;

    queue[0] = r;
}

void create() {
    createMainRoutine(mainRoutineEntry);
    execute();
    release();
}


__attribute__((naked)) void switchToInitChildRoutine() {
    __asm__ volatile(
        "mov %0, sp\n\t"
        "mov %1, x29\n\t"
        "mov %2, x30\n\t"
        : "=r"(queue[0].sp), "=r"(queue[0].x29), "=r"(queue[0].x30)
        :
        :"memory"
    );
   
    __asm__ volatile(
        "ldr x10, [%0]\n\t"
        "mov sp, x10\n\t"
        "ldr x29, [%1]\n\t"
        "ldr x30, [%2]\n\t"
        "ldr x0, [%3]\n\t"
        "ret\n\t"
        :
        :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30), "r"(&current->x0)
        :"x10", "x30", "memory", "x0"
    );
    
}

__attribute__((naked)) void switchToChildRoutine() {
    __asm__ volatile(
        "mov %0, sp\n\t"
        "mov %1, x29\n\t"
        "mov %2, x30\n\t"
        : "=r"(queue[0].sp), "=r"(queue[0].x29), "=r"(queue[0].x30)
        :
        :"memory"
    );
    __asm__ volatile(
        "mov x10, %0\n\t"
        "mov sp, x10\n\t"
        "mov x29, %1\n\t"
        "mov x30, %2\n\t"
        "ret\n\t"
        : 
        :"r"(current->sp), "r"(current->x29), "r"(current->x30)
        :"x10", "x30", "memory"
    );
}



__attribute__((naked)) void switchToRoutine() {
    __asm__ volatile(
        "mov x1, sp\n\t"
        "mov %0, x1\n\t"
        "mov x1, x29\n\t"
        "mov %1, x1\n\t"
        "mov x1, x30\n\t"
        "mov %2, x1\n\t"
        : "=r"(current->sp), "=r"(current->x29), "=r"(current->x30)
        :
        :"memory", "x1"
    );
    
    __asm__ volatile(
        "mov x10, %0\n\t"
        "mov sp, x10\n\t"
        "mov x29, %1\n\t"
        "mov x30, %2\n\t"
        "ret\n\t"
        :
        :"r"(queue[0].sp), "r"(queue[0].x29), "r"(queue[0].x30)
        :"x10", "x30", "memory"
    );
}

void hello() {
    printf("hello\n");
    switchToRoutine();
    printf("world\n");
    switchToRoutine();
    printf("111\n");
}


void mainRoutineEntry() {
    while(1) {
        // current = queueZero();
        current = &queue[0];
        printf("enter main routine\n");
        Routine* b = &queue[1];
        if (b->dead == 1) {
            printf("yes\n");
            __asm__ volatile(
                "mov sp, %0\n\t"
                "mov x29, %1\n\t"
                "mov x30, %2\n\t"
                "ret\n\t"
                :
                :"r"(current->endSp), "r"(current->endX29), "r"(current->endX30)
            );
        } else {
            current = b;
            if (b->start == 0) {
                b->start = 1;
                switchToInitChildRoutine();
            } else {
                switchToChildRoutine();
            }
        }
    }
};

__attribute__((naked)) void execute() {
    __asm__ volatile(
        "mov x2, sp\n\t"
        "str x2, [%0]\n\t"
        "str x29, [%1]\n\t"
        "str x30, [%2]\n\t"
        "ldr x2, [%3]\n\t"
        "mov sp, x2\n\t"
        "ldr x29, [%4]\n\t"
        "ldr x30, [%5]\n\t"
        "ret\n\t"
        :
        :"r"(&queue[0].endSp), "r"(&queue[0].endX29), "r"(&queue[0].endX30),"r"(&queue[0].sp), "r"(&queue[0].x29), "r"(&queue[0].x30)
        :"x2", "x30", "memory"
    );
}

void release() {
    free((uint8_t*)(queue[0].stack));
    free((uint8_t*)(queue[1].stack));
}

int main() {
    createRoutine(hello);

    create();
    
    printf("ok, that's right\n");
    printf("wow\n");

    return 0;
}

You have to compile it by -O1, -O0 not works.

7
  • In case you care, you can achieve coros with ucontext. Example Commented Oct 30, 2024 at 14:24
  • 1
    Have you looked at the compiler-generated asm around your inline asm templates? Do the compiler-generated function prologues do stuff that you fail to undo before you manual ret inside one of the asm statements? That's highly suspicious, and will probably be different at -O0 vs. -O3. Normally a context-switch function should be __attribute__((naked)) so you write the whole function in assembly, not mixing compiler-generated C with your own ret or modification of sp (which also isn't officially supported, although you're at least avoiding "m" operands.) Commented Oct 30, 2024 at 18:07
  • @PeterCordes Thanks.I take your advice and reimplement execute switchToRoutine switchToChildRoutine in __attribute__((naked)) style.It really helps me out.But there's a difference.If I compile using -O0, program still cannot work well, instead, I compile using -O1, it works well. It's so mysterious, I don't know why. I still learn something from this problem: Be cautious with inline assemble codes, it's not as you think. By the way, authors who develop coroutine library, are genius, cause they master the assembly code tech and overcome any problems. Commented Oct 31, 2024 at 7:28
  • @ikegami Thank you for your advice. Example looks cool, but I am not familiar with ucontext. I will spend some time learning and figure it out.And now, I just want to write a simple stackful coroutine for fun, not depending on multiple threads yet.Someday, I might extend my codes to support multiple thread case. Commented Oct 31, 2024 at 7:34
  • 1
    Re "not depending on multiple threads yet", A coro is just a co-operative user-space thread. I used the word thread in that example since that's what the OP had used, but you can rename all of those to "coro" if you want. Commented Oct 31, 2024 at 10:06

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.