I am trying to implement a simple coroutine in C.
The platform is:
- M3 Pro MacBook Pro 16
- Apple's native `gcc` (which is really Apple clang; see the `gcc -v` output below)
- macOS 14.3.1
Here is my code:
// main.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/*
 * Size in bytes of each heap-allocated coroutine stack.
 *
 * The routines in this file call printf() while running on these stacks.
 * 1 KiB is too small for that: once the stack grows past the start of the
 * malloc'd block it overwrites neighbouring heap memory, which is a likely
 * cause of the "malloc: Region cookie corrupted" abort. 64 KiB keeps the
 * value a multiple of 16 and leaves room for library calls.
 */
#define STACK_SIZE (64 * 1024)
/*
 * Saved register context and bookkeeping for one coroutine.
 *
 * All 64-bit members use the standard <stdint.h> uint64_t (the original mixed
 * it with the non-standard BSD spelling u_int64_t). Member order is unchanged.
 */
typedef struct {
    uint64_t x0;        /* value loaded into x0 the first time the routine is entered */
    uint64_t sp;        /* saved stack pointer */
    uint64_t x29;       /* saved frame pointer register */
    uint64_t x30;       /* saved link register: the address `ret` jumps to on resume */
    uint64_t endAddr;   /* not referenced by the code in this file */
    uint64_t endSp;     /* sp recorded by execute(), restored once the child is dead */
    uint64_t endX29;    /* x29 recorded by execute() */
    uint64_t endX30;    /* x30 recorded by execute() */
    uint64_t entry;     /* address of the function recorded at creation time */
    uint64_t stack;     /* address returned by malloc() for this routine's stack */
    size_t size;        /* recorded as STACK_SIZE - 1 at creation time */
    volatile int dead;  /* set to 1 by markCurrentDead() */
    volatile int start; /* 0 until switchToChildRoutine() enters the routine for the first time */
} Routine;
/* Pointer to a routine body: a function returning void (parameter list left unspecified). */
typedef void (*Entry)();
/* The routine whose saved context the switch functions read and write. */
volatile Routine* current;
/* Routine slots: queue[0] is filled by createMainRoutine(), queue[1] by createRoutine(). */
Routine queue[2];
/* Forward declaration: wrapper() calls switchToRoutine() before it is defined. */
void switchToRoutine();
/*
 * Mark the routine that `current` points at as finished by setting its
 * `dead` flag to 1. mainRoutineEntry() polls this flag.
 *
 * The stray ';' that followed the closing brace was removed: an empty
 * declaration at file scope is not valid ISO C.
 */
void markCurrentDead() {
    current->dead = 1;
}
/*
 * First function executed on a child routine's stack (createRoutine() stores
 * its address as the routine's saved x30 and `entry` as its saved x0).
 *
 * entry: the routine body to run.
 */
void wrapper(Entry entry) {
/* Run the routine body to completion. */
entry();
/* Record that the body has returned so the main routine stops resuming it. */
markCurrentDead();
/* Hand control back to queue[0]. */
switchToRoutine();
}
/*
 * Allocate a stack for the child routine and store its initial context in
 * queue[1].
 *
 * entry: routine body; stored as the saved x0 so that the first switch into
 *        this routine effectively calls wrapper(entry).
 *
 * Exits the process with EXIT_FAILURE if the stack cannot be allocated.
 */
void createRoutine(Entry entry) {
    uint8_t *stack = malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }

    /* Round the last byte of the block down to a 16-byte boundary. */
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~(uint64_t)0xF;

    /*
     * A compound literal zero-fills every member that is not named, so no
     * indeterminate values (endAddr, endSp, endX29, endX30) are copied into
     * the queue as happened with the previous uninitialised local struct.
     */
    queue[1] = (Routine){
        .x0 = (uint64_t)entry,
        .sp = aligned_sp,
        .x29 = aligned_sp,
        .x30 = (uint64_t)wrapper,
        .entry = (uint64_t)wrapper,
        .stack = (uint64_t)stack,
        .size = STACK_SIZE - 1,
        .dead = 0,
        .start = 0,
    };
}
/*
 * Allocate a stack for the main (scheduler) routine and store its initial
 * context in queue[0].
 *
 * entry: function the routine starts in; stored as the saved x30 (the `ret`
 *        target), as `entry`, and as the saved x0.
 *
 * Exits the process with EXIT_FAILURE if the stack cannot be allocated.
 */
void createMainRoutine(Entry entry) {
    uint8_t *stack = malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }

    /* Round the last byte of the block down to a 16-byte boundary. */
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~(uint64_t)0xF;

    /*
     * Unnamed members are zero-filled. The previous version never assigned
     * `start` (nor the end* members), so indeterminate values were copied
     * into queue[0].
     */
    queue[0] = (Routine){
        .x0 = (uint64_t)entry,
        .sp = aligned_sp,
        .x29 = aligned_sp,
        .x30 = (uint64_t)entry,
        .entry = (uint64_t)entry,
        .stack = (uint64_t)stack,
        .size = STACK_SIZE - 1,
        .dead = 0,
    };
}
/*
 * Save the running context into *current, make `routine` the current routine
 * and jump into it.
 *
 * routine: routine to enter. On its first entry (start == 0) x0 is loaded
 *          from the saved context as well, so the target function receives
 *          its argument.
 *
 * The operands that had been mangled into "¤t->..." are restored to
 * "&current->...".
 *
 * NOTE(review): this function executes `ret` inside inline asm and records
 * `sp + 32` as the resume stack pointer; both rely on the frame the compiler
 * generates for this function and may differ between optimisation levels.
 */
void switchToChildRoutine(Routine* routine) {
    /* Record sp + 32, x29 and x30 in the outgoing routine's slot. */
    __asm__ volatile(
        "add %0, sp, #32\n\t"
        "mov %1, x29\n\t"
        "mov %2, x30\n\t"
        : "=r"(current->sp), "=r"(current->x29), "=r"(current->x30)
        :
        :"memory"
    );
    current = routine;
    if (current->start == 0) {
        /* First entry: also load x0, then `ret` to the saved x30. */
        current->start = 1;
        __asm__ volatile(
            "ldr x10, [%0]\n\t"
            "mov sp, x10\n\t"
            "ldr x29, [%1]\n\t"
            "ldr x30, [%2]\n\t"
            "ldr x0, [%3]\n\t"
            "ret\n\t"
            :
            :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30), "r"(&current->x0)
            :"x10", "x30", "memory", "x0"
        );
    } else {
        /* Resume: restore sp, x29 and x30, then `ret` to the saved x30. */
        __asm__ volatile(
            "ldr x10, [%0]\n\t"
            "mov sp, x10\n\t"
            "ldr x29, [%1]\n\t"
            "ldr x30, [%2]\n\t"
            "ret\n\t"
            :
            :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30)
            :"x10", "x30", "memory"
        );
    }
}
/*
 * Save the running routine's context into *current, then switch to queue[0].
 *
 * The operands that had been mangled into "¤t->..." are restored to
 * "&current->...".
 *
 * NOTE(review): the saved x29/x30 are read from [sp] and [sp, #8] and the
 * saved sp is sp + 16, which presumes the compiler pushed exactly one
 * x29/x30 pair in this function's prologue - confirm for the optimisation
 * level in use. The function also executes `ret` inside inline asm.
 */
void switchToRoutine() {
    /* Store sp + 16 and the two 8-byte words at [sp] / [sp, #8] into *current. */
    __asm__ volatile(
        "add x1, sp, #16\n\t"
        "str x1, [%0]\n\t"
        "ldr x1, [sp]\n\t"
        "str x1, [%1]\n\t"
        "ldr x1, [sp, #8]\n\t"
        "str x1, [%2]\n\t"
        :
        :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30)
        :"memory", "x1"
    );
    current = &queue[0];
    /* Restore queue[0]'s sp, x29 and x30 and `ret` to its saved x30. */
    __asm__ volatile(
        "ldr x10, [%0]\n\t"
        "mov sp, x10\n\t"
        "ldr x29, [%1]\n\t"
        "ldr x30, [%2]\n\t"
        "ret\n\t"
        :
        :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30)
        :"x10", "x30", "memory"
    );
}
/*
 * Body of the child routine (passed to createRoutine() in main()).
 * Prints three lines; the yield back to the main routine between the first
 * two is currently disabled.
 */
void hello() {
printf("hello\n");
//switchToRoutine();
printf("world\n");
printf("111\n");
}
/* Return the address of the first routine slot (the one createMainRoutine() fills). */
Routine* queueZero() {
    Routine* first_slot = queue;
    return first_slot;
}
/* Return the address of the second routine slot (the one createRoutine() fills). */
Routine* queueOne() {
    Routine* second_slot = queue + 1;
    return second_slot;
}
/*
 * Entry point of the main (scheduler) routine; execute() jumps here.
 *
 * Loops forever: while the child routine in queue[1] is not dead it is
 * resumed through switchToChildRoutine(); once it is dead, the sp/x29/x30
 * values that execute() stored in queue[0].endSp/endX29/endX30 are restored
 * and `ret` leaves the coroutine world.
 *
 * The stray ';' after the closing brace was removed (not valid ISO C at file
 * scope).
 */
void mainRoutineEntry() {
    while(1) {
        printf("enter main routine\n");
        Routine* b = queueOne();
        if (b->dead == 1) {
            printf("yes\n");
            Routine* a = queueZero();
            /* Restore the context recorded by execute() and return through it. */
            __asm__ volatile(
                "mov sp, %0\n\t"
                "mov x29, %1\n\t"
                "mov x30, %2\n\t"
                "ret\n\t"
                :
                :"r"(a->endSp), "r"(a->endX29), "r"(a->endX30)
            );
        } else {
            switchToChildRoutine(b);
        }
    }
}
/*
 * Record the caller's context in queue[0].endSp/endX29/endX30 and jump into
 * the main routine using queue[0].sp/x29/x30.
 *
 * NOTE(review): `sp + 32` is stored as the stack pointer to restore later,
 * which presumes this function's frame is exactly 32 bytes - confirm for the
 * compiler and optimisation level in use. The function leaves through a
 * `ret` inside inline asm, so the compiler-generated epilogue never runs.
 */
void execute() {
/* Copy the link register into a local so it can be stored from C below. */
volatile uint64_t x30 = 0;
__asm__ volatile(
"mov %0, x30\n\t"
:"=r"(x30)
:
);
current = &queue[0];
current->endX30 = x30;
/* Store sp + 32 and x29 into endSp/endX29, then load queue[0]'s sp, x29, x30 and `ret`.
   Operand %2 (&queue[0].endX30) is passed but not used by the template. */
__asm__ volatile(
"add x2, sp, #32\n\t"
"str x2, [%0]\n\t"
"str x29, [%1]\n\t"
"ldr x2, [%3]\n\t"
"mov sp, x2\n\t"
"ldr x29, [%4]\n\t"
"ldr x30, [%5]\n\t"
"ret\n\t"
:
:"r"(&queue[0].endSp), "r"(&queue[0].endX29), "r"(&queue[0].endX30),"r"(&queue[0].sp), "r"(&queue[0].x29), "r"(&queue[0].x30)
:"x2", "x30", "memory"
);
}
/* Free the stack blocks recorded in both routine slots, slot 0 first. */
void release() {
    for (int slot = 0; slot < 2; slot++) {
        free((uint8_t*)(queue[slot].stack));
    }
}
/*
 * Program entry: create the child routine (hello) and the main routine,
 * run them via execute(), then free both stacks and print two status lines.
 */
int main() {
createRoutine(hello);
createMainRoutine(mainRoutineEntry);
/* Control comes back here once mainRoutineEntry() restores the context saved by execute(). */
execute();
// release();
/* The stacks are freed inline here; release() performs the same two calls. */
free((uint8_t*)(queue[0].stack));
free((uint8_t*)(queue[1].stack));
printf("ok, that's right\n");
printf("wow\n");
return 0;
}
Compile and run:
gcc main.c -o main
./main
Run gcc -v, here is the output:
Apple clang version 15.0.0 (clang-1500.1.0.2.5)
Target: arm64-apple-darwin23.3.0
Thread model: posix
InstalledDir: /Library/Developer/CommandLineTools/usr/bin
I ran the program many times. Sometimes it runs correctly, and sometimes it fails with this error:
demo(7527,0x1da301c40) malloc: Region cookie corrupted for region 0x159800000 (value is 0)[0x1598081fc]
demo(7527,0x1da301c40) malloc: *** set a breakpoint in malloc_error_break to debug
[1] 7527 abort ./demo
I don't know why this happens.
If I change the main function like this:
/*
 * Variant of main() that frees the stacks through release() instead of two
 * inline free() calls.
 */
int main() {
createRoutine(hello);
createMainRoutine(mainRoutineEntry);
execute();
release();
// free((uint8_t*)(queue[0].stack));
// free((uint8_t*)(queue[1].stack));
printf("ok, that's right\n");
printf("wow\n");
return 0;
}
then it always fails to run.
My first guess is that something is wrong in the execute function. I checked the code line by line, especially the inline assembly, and it all looks right: the x29, sp and x30 registers are loaded and stored correctly. On the other hand, the problem might be caused by the global variables current and queue: after they are modified while running on one routine's stack and execution moves to another routine's stack, the code might not read the newest values of these globals. Unfortunately, I am not good at assembly, so I cannot prove that.
I hope to get the program running successfully and to understand the reason behind this behaviour.
Thanks to Peter Cordes, I refactored my code and everything works now. Here is the new code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/*
 * Size in bytes of each heap-allocated coroutine stack.
 *
 * The routines in this file call printf() while running on these stacks.
 * 1 KiB is too small for that: once the stack grows past the start of the
 * malloc'd block it overwrites neighbouring heap memory. 64 KiB keeps the
 * value a multiple of 16 and leaves room for library calls.
 */
#define STACK_SIZE (64 * 1024)
/*
 * Saved register context and bookkeeping for one coroutine.
 *
 * All 64-bit members use the standard <stdint.h> uint64_t (the original mixed
 * it with the non-standard BSD spelling u_int64_t). Member order is unchanged.
 */
typedef struct {
    uint64_t x0;        /* value loaded into x0 by switchToInitChildRoutine() */
    uint64_t sp;        /* saved stack pointer */
    uint64_t x29;       /* saved frame pointer register */
    uint64_t x30;       /* saved link register: the address `ret` jumps to on resume */
    uint64_t endAddr;   /* not referenced by the code in this file */
    uint64_t endSp;     /* sp recorded by execute(), restored once the child is dead */
    uint64_t endX29;    /* x29 recorded by execute() */
    uint64_t endX30;    /* x30 recorded by execute() */
    uint64_t entry;     /* address of the function recorded at creation time */
    uint64_t stack;     /* address returned by malloc() for this routine's stack */
    size_t size;        /* recorded as STACK_SIZE - 1 at creation time */
    volatile int dead;  /* set to 1 by markCurrentDead() */
    volatile int start; /* 0 until mainRoutineEntry() enters the routine for the first time */
} Routine;
/* Accessor prototypes; no definition is visible in this listing and the only call is commented out. */
Routine* queueZero();
Routine* queueOne();
/* Pointer to a routine body: a function returning void (parameter list left unspecified). */
typedef void (*Entry)();
/* The routine whose saved context the switch functions read and write. */
Routine* current;
/* Routine slots: queue[0] is filled by createMainRoutine(), queue[1] by createRoutine(). */
Routine queue[2];
/* Forward declaration: wrapper() calls switchToRoutine() before it is defined. */
void switchToRoutine();
/*
 * Mark the routine that `current` points at as finished by setting its
 * `dead` flag to 1. mainRoutineEntry() polls this flag.
 *
 * The stray ';' that followed the closing brace was removed: an empty
 * declaration at file scope is not valid ISO C.
 */
void markCurrentDead() {
    current->dead = 1;
}
/*
 * First function executed on a child routine's stack (createRoutine() stores
 * its address as the routine's saved x30 and `entry` as its saved x0).
 *
 * entry: the routine body to run.
 */
void wrapper(Entry entry) {
/* Run the routine body to completion. */
entry();
/* Record that the body has returned so the main routine stops resuming it. */
markCurrentDead();
printf("ehy\n");
/* Hand control back to queue[0]. */
switchToRoutine();
}
/*
 * Allocate a stack for the child routine and store its initial context in
 * queue[1].
 *
 * entry: routine body; stored as the saved x0 so that the first switch into
 *        this routine effectively calls wrapper(entry).
 *
 * Exits the process with EXIT_FAILURE if the stack cannot be allocated.
 */
void createRoutine(Entry entry) {
    uint8_t *stack = malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }

    /* Round the last byte of the block down to a 16-byte boundary. */
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~(uint64_t)0xF;

    /*
     * A compound literal zero-fills every member that is not named, so no
     * indeterminate values (endAddr, endSp, endX29, endX30) are copied into
     * the queue as happened with the previous uninitialised local struct.
     */
    queue[1] = (Routine){
        .x0 = (uint64_t)entry,
        .sp = aligned_sp,
        .x29 = aligned_sp,
        .x30 = (uint64_t)wrapper,
        .entry = (uint64_t)wrapper,
        .stack = (uint64_t)stack,
        .size = STACK_SIZE - 1,
        .dead = 0,
        .start = 0,
    };
}
/* Forward declarations for functions that create() uses before their definitions. */
void execute();
void release();
void mainRoutineEntry();
/*
 * Allocate a stack for the main (scheduler) routine and store its initial
 * context in queue[0].
 *
 * entry: function the routine starts in; stored as the saved x30 (the `ret`
 *        target), as `entry`, and as the saved x0.
 *
 * Exits the process with EXIT_FAILURE if the stack cannot be allocated.
 */
void createMainRoutine(Entry entry) {
    uint8_t *stack = malloc(STACK_SIZE);
    if (stack == NULL) {
        perror("Failed to allocate stack memory");
        exit(EXIT_FAILURE);
    }

    /* Round the last byte of the block down to a 16-byte boundary. */
    uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~(uint64_t)0xF;

    /*
     * Unnamed members are zero-filled. The previous version never assigned
     * `start` (nor the end* members), so indeterminate values were copied
     * into queue[0].
     */
    queue[0] = (Routine){
        .x0 = (uint64_t)entry,
        .sp = aligned_sp,
        .x29 = aligned_sp,
        .x30 = (uint64_t)entry,
        .entry = (uint64_t)entry,
        .stack = (uint64_t)stack,
        .size = STACK_SIZE - 1,
        .dead = 0,
    };
}
/*
 * Set up the main routine, run the coroutines, then free both stacks.
 * execute() records this function's sp/x29/x30 so that mainRoutineEntry()
 * can later return to the statement after the execute() call.
 */
void create() {
createMainRoutine(mainRoutineEntry);
execute();
release();
}
__attribute__((naked)) void switchToInitChildRoutine() {
__asm__ volatile(
"mov %0, sp\n\t"
"mov %1, x29\n\t"
"mov %2, x30\n\t"
: "=r"(queue[0].sp), "=r"(queue[0].x29), "=r"(queue[0].x30)
:
:"memory"
);
__asm__ volatile(
"ldr x10, [%0]\n\t"
"mov sp, x10\n\t"
"ldr x29, [%1]\n\t"
"ldr x30, [%2]\n\t"
"ldr x0, [%3]\n\t"
"ret\n\t"
:
:"r"(¤t->sp), "r"(¤t->x29), "r"(¤t->x30), "r"(¤t->x0)
:"x10", "x30", "memory", "x0"
);
}
/*
 * Resume the routine `current` points at after it has already been started.
 *
 * Saves sp, x29 and x30 of the caller into queue[0], then restores sp, x29
 * and x30 from *current and `ret`s to the restored x30.
 *
 * NOTE(review): naked function with C operands in its asm statements - the
 * registers chosen for the operands are up to the compiler; verify the
 * generated code for the optimisation level in use.
 */
__attribute__((naked)) void switchToChildRoutine() {
/* Save the main routine's context. */
__asm__ volatile(
"mov %0, sp\n\t"
"mov %1, x29\n\t"
"mov %2, x30\n\t"
: "=r"(queue[0].sp), "=r"(queue[0].x29), "=r"(queue[0].x30)
:
:"memory"
);
/* Restore the child's saved context and jump to its saved x30. */
__asm__ volatile(
"mov x10, %0\n\t"
"mov sp, x10\n\t"
"mov x29, %1\n\t"
"mov x30, %2\n\t"
"ret\n\t"
:
:"r"(current->sp), "r"(current->x29), "r"(current->x30)
:"x10", "x30", "memory"
);
}
/*
 * Yield from the routine `current` points at back to the main routine.
 *
 * Saves sp, x29 and x30 into *current, then restores sp, x29 and x30 from
 * queue[0] and `ret`s to the restored x30.
 *
 * NOTE(review): naked function with C operands in its asm statements - the
 * registers chosen for the operands are up to the compiler; verify the
 * generated code for the optimisation level in use.
 */
__attribute__((naked)) void switchToRoutine() {
/* Save the yielding routine's context (each value goes through x1). */
__asm__ volatile(
"mov x1, sp\n\t"
"mov %0, x1\n\t"
"mov x1, x29\n\t"
"mov %1, x1\n\t"
"mov x1, x30\n\t"
"mov %2, x1\n\t"
: "=r"(current->sp), "=r"(current->x29), "=r"(current->x30)
:
:"memory", "x1"
);
/* Restore the main routine's saved context and jump to its saved x30. */
__asm__ volatile(
"mov x10, %0\n\t"
"mov sp, x10\n\t"
"mov x29, %1\n\t"
"mov x30, %2\n\t"
"ret\n\t"
:
:"r"(queue[0].sp), "r"(queue[0].x29), "r"(queue[0].x30)
:"x10", "x30", "memory"
);
}
/*
 * Body of the child routine (passed to createRoutine() in main()).
 * Prints three lines and yields to the main routine after the first and
 * after the second.
 */
void hello() {
printf("hello\n");
switchToRoutine();
printf("world\n");
switchToRoutine();
printf("111\n");
}
/*
 * Entry point of the main (scheduler) routine; execute() jumps here.
 *
 * Each iteration resets `current` to queue[0]. While the child in queue[1]
 * is not dead it becomes `current` and is entered (first time) or resumed.
 * Once it is dead, the sp/x29/x30 values that execute() stored in
 * queue[0].endSp/endX29/endX30 are restored and `ret` leaves the coroutine
 * world.
 *
 * Removed the stray ';' after the closing brace (not valid ISO C at file
 * scope) and the dead commented-out assignment.
 */
void mainRoutineEntry() {
    while(1) {
        current = &queue[0];
        printf("enter main routine\n");
        Routine* b = &queue[1];
        if (b->dead == 1) {
            printf("yes\n");
            /* Restore the context recorded by execute() and return through it. */
            __asm__ volatile(
                "mov sp, %0\n\t"
                "mov x29, %1\n\t"
                "mov x30, %2\n\t"
                "ret\n\t"
                :
                :"r"(current->endSp), "r"(current->endX29), "r"(current->endX30)
            );
        } else {
            current = b;
            if (b->start == 0) {
                /* First entry also needs x0 loaded for wrapper(entry). */
                b->start = 1;
                switchToInitChildRoutine();
            } else {
                switchToChildRoutine();
            }
        }
    }
}
/*
 * Record the caller's sp, x29 and x30 in queue[0].endSp/endX29/endX30, then
 * load queue[0].sp/x29/x30 and `ret` to the loaded x30 (set to the main
 * routine's entry by createMainRoutine()).
 *
 * NOTE(review): naked function with C operands in its asm statement - the
 * compiler must materialise six addresses in registers without a prologue;
 * verify the generated code for the optimisation level in use.
 */
__attribute__((naked)) void execute() {
__asm__ volatile(
"mov x2, sp\n\t"
"str x2, [%0]\n\t"
"str x29, [%1]\n\t"
"str x30, [%2]\n\t"
"ldr x2, [%3]\n\t"
"mov sp, x2\n\t"
"ldr x29, [%4]\n\t"
"ldr x30, [%5]\n\t"
"ret\n\t"
:
:"r"(&queue[0].endSp), "r"(&queue[0].endX29), "r"(&queue[0].endX30),"r"(&queue[0].sp), "r"(&queue[0].x29), "r"(&queue[0].x30)
:"x2", "x30", "memory"
);
}
/* Free the stack blocks recorded in both routine slots, slot 0 first. */
void release() {
    for (int slot = 0; slot < 2; slot++) {
        free((uint8_t*)(queue[slot].stack));
    }
}
/*
 * Program entry: create the child routine (hello), run everything through
 * create(), then print two status lines.
 */
int main() {
createRoutine(hello);
/* create() builds the main routine, runs the coroutines and frees both stacks. */
create();
printf("ok, that's right\n");
printf("wow\n");
return 0;
}
You have to compile it with -O1; it does not work with -O0.
`ret` inside one of the asm statements? That's highly suspicious, and will probably be different at `-O0` vs. `-O3`. Normally a context-switch function should be `__attribute__((naked))` so you write the whole function in assembly, not mixing compiler-generated C with your own `ret` or modification of `sp` (which also isn't officially supported, although you're at least avoiding `"m"` operands.)
I rewrote `execute`, `switchToRoutine` and `switchToChildRoutine` in `__attribute__((naked))` style. It really helped me out. But there's a difference: if I compile with `-O0`, the program still does not work, whereas with `-O1` it works well. It's mysterious, and I don't know why. I still learned something from this problem: be cautious with inline assembly code, because it does not always behave as you expect. By the way, the authors who develop coroutine libraries are geniuses, because they have mastered assembly and overcome all of these problems.