runtime intrinsics refactoring using RISC-V custom instruction assembly directives

This commit is contained in:
Blaise Tine 2021-02-04 15:15:20 -05:00
parent a9f82bceae
commit b047f589d6
44 changed files with 90586 additions and 90486 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

Binary file not shown.

View file

@ -6,451 +6,377 @@ Disassembly of section .init:
80000000 <_start>: 80000000 <_start>:
80000000: 00000597 auipc a1,0x0 80000000: 00000597 auipc a1,0x0
80000004: 0e458593 addi a1,a1,228 # 800000e4 <vx_set_sp> 80000004: 0bc58593 addi a1,a1,188 # 800000bc <vx_set_sp>
80000008: fc102573 csrr a0,0xfc1 80000008: fc102573 csrr a0,0xfc1
8000000c: 00b5106b 0xb5106b 8000000c: 00b5106b 0xb5106b
80000010: 0d4000ef jal ra,800000e4 <vx_set_sp> 80000010: 0ac000ef jal ra,800000bc <vx_set_sp>
80000014: 00100513 li a0,1 80000014: 00100513 li a0,1
80000018: 0005006b 0x5006b 80000018: 0005006b 0x5006b
8000001c: 00002517 auipc a0,0x2 8000001c: 00002517 auipc a0,0x2
80000020: 99050513 addi a0,a0,-1648 # 800019ac <__BSS_END__> 80000020: 8e850513 addi a0,a0,-1816 # 80001904 <__BSS_END__>
80000024: 00002617 auipc a2,0x2 80000024: 00002617 auipc a2,0x2
80000028: 98860613 addi a2,a2,-1656 # 800019ac <__BSS_END__> 80000028: 8e060613 addi a2,a2,-1824 # 80001904 <__BSS_END__>
8000002c: 40a60633 sub a2,a2,a0 8000002c: 40a60633 sub a2,a2,a0
80000030: 00000593 li a1,0 80000030: 00000593 li a1,0
80000034: 2ac000ef jal ra,800002e0 <memset> 80000034: 204000ef jal ra,80000238 <memset>
80000038: 00000517 auipc a0,0x0 80000038: 00000517 auipc a0,0x0
8000003c: 1b050513 addi a0,a0,432 # 800001e8 <__libc_fini_array> 8000003c: 10850513 addi a0,a0,264 # 80000140 <__libc_fini_array>
80000040: 160000ef jal ra,800001a0 <atexit> 80000040: 0b8000ef jal ra,800000f8 <atexit>
80000044: 200000ef jal ra,80000244 <__libc_init_array> 80000044: 158000ef jal ra,8000019c <__libc_init_array>
80000048: 008000ef jal ra,80000050 <main> 80000048: 008000ef jal ra,80000050 <main>
8000004c: 1680006f j 800001b4 <exit> 8000004c: 0c00006f j 8000010c <exit>
Disassembly of section .text: Disassembly of section .text:
80000050 <main>: 80000050 <main>:
80000050: ff010113 addi sp,sp,-16 80000050: 7ffff7b7 lui a5,0x7ffff
80000054: 7ffff7b7 lui a5,0x7ffff 80000054: 0007a703 lw a4,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
80000058: 00812423 sw s0,8(sp) 80000058: 0047a683 lw a3,4(a5)
8000005c: 0007a403 lw s0,0(a5) # 7ffff000 <__stack_size+0x7fffec00> 8000005c: 0087a583 lw a1,8(a5)
80000060: 00912223 sw s1,4(sp) 80000060: cc5027f3 csrr a5,0xcc5
80000064: 01212023 sw s2,0(sp) 80000064: 02e787b3 mul a5,a5,a4
80000068: 0087a483 lw s1,8(a5) 80000068: 02070863 beqz a4,80000098 <main+0x48>
8000006c: 0047a903 lw s2,4(a5) 8000006c: 00f70733 add a4,a4,a5
80000070: 00112623 sw ra,12(sp) 80000070: 00271713 slli a4,a4,0x2
80000074: 0fc000ef jal ra,80000170 <vx_core_id> 80000074: 00279793 slli a5,a5,0x2
80000078: 02850533 mul a0,a0,s0 80000078: 00d787b3 add a5,a5,a3
8000007c: 02040863 beqz s0,800000ac <main+0x5c> 8000007c: 00d70733 add a4,a4,a3
80000080: 00a40733 add a4,s0,a0 80000080: 40d585b3 sub a1,a1,a3
80000084: 00271713 slli a4,a4,0x2 80000084: 0007a603 lw a2,0(a5)
80000088: 00251513 slli a0,a0,0x2 80000088: 00f586b3 add a3,a1,a5
8000008c: 012507b3 add a5,a0,s2 8000008c: 00478793 addi a5,a5,4
80000090: 01270733 add a4,a4,s2 80000090: 00c6a023 sw a2,0(a3)
80000094: 412485b3 sub a1,s1,s2 80000094: fef718e3 bne a4,a5,80000084 <main+0x34>
80000098: 0007a603 lw a2,0(a5) 80000098: 00008067 ret
8000009c: 00f586b3 add a3,a1,a5
800000a0: 00478793 addi a5,a5,4
800000a4: 00c6a023 sw a2,0(a3)
800000a8: fef718e3 bne a4,a5,80000098 <main+0x48>
800000ac: 00c12083 lw ra,12(sp)
800000b0: 00812403 lw s0,8(sp)
800000b4: 00412483 lw s1,4(sp)
800000b8: 00012903 lw s2,0(sp)
800000bc: 01010113 addi sp,sp,16
800000c0: 00008067 ret
800000c4 <register_fini>: 8000009c <register_fini>:
800000c4: 00000793 li a5,0 8000009c: 00000793 li a5,0
800000c8: 00078863 beqz a5,800000d8 <register_fini+0x14> 800000a0: 00078863 beqz a5,800000b0 <register_fini+0x14>
800000cc: 80000537 lui a0,0x80000 800000a4: 80000537 lui a0,0x80000
800000d0: 1e850513 addi a0,a0,488 # 800001e8 <__stack_top+0x810001e8> 800000a8: 14050513 addi a0,a0,320 # 80000140 <__stack_top+0x81000140>
800000d4: 0cc0006f j 800001a0 <atexit> 800000ac: 04c0006f j 800000f8 <atexit>
800000d8: 00008067 ret 800000b0: 00008067 ret
800000dc <_exit>: 800000b4 <_exit>:
800000dc: 00000513 li a0,0 800000b4: 00000513 li a0,0
800000e0: 0005006b 0x5006b 800000b8: 0005006b 0x5006b
800000e4 <vx_set_sp>: 800000bc <vx_set_sp>:
800000e4: fc002573 csrr a0,0xfc0 800000bc: fc002573 csrr a0,0xfc0
800000e8: 0005006b 0x5006b 800000c0: 0005006b 0x5006b
800000ec: 00002197 auipc gp,0x2 800000c4: 00002197 auipc gp,0x2
800000f0: c9418193 addi gp,gp,-876 # 80001d80 <__global_pointer> 800000c8: c1418193 addi gp,gp,-1004 # 80001cd8 <__global_pointer>
800000f4: 7f000117 auipc sp,0x7f000 800000cc: 7f000117 auipc sp,0x7f000
800000f8: f0c10113 addi sp,sp,-244 # ff000000 <__stack_top> 800000d0: f3410113 addi sp,sp,-204 # ff000000 <__stack_top>
800000fc: 40000593 li a1,1024 800000d4: 40000593 li a1,1024
80000100: cc102673 csrr a2,0xcc1 800000d8: cc102673 csrr a2,0xcc1
80000104: 02c585b3 mul a1,a1,a2 800000dc: 02c585b3 mul a1,a1,a2
80000108: 40b10133 sub sp,sp,a1 800000e0: 40b10133 sub sp,sp,a1
8000010c: cc3026f3 csrr a3,0xcc3 800000e4: cc3026f3 csrr a3,0xcc3
80000110: 00068663 beqz a3,8000011c <RETURN> 800000e8: 00068663 beqz a3,800000f4 <RETURN>
80000114: 00000513 li a0,0 800000ec: 00000513 li a0,0
80000118: 0005006b 0x5006b 800000f0: 0005006b 0x5006b
8000011c <RETURN>: 800000f4 <RETURN>:
8000011c: 00008067 ret 800000f4: 00008067 ret
80000120 <vx_wspawn>: 800000f8 <atexit>:
80000120: 00b5106b 0xb5106b 800000f8: 00050593 mv a1,a0
80000124: 00008067 ret 800000fc: 00000693 li a3,0
80000100: 00000613 li a2,0
80000104: 00000513 li a0,0
80000108: 20c0006f j 80000314 <__register_exitproc>
80000128 <vx_tmc>: 8000010c <exit>:
80000128: 0005006b 0x5006b 8000010c: ff010113 addi sp,sp,-16
8000012c: 00008067 ret 80000110: 00000593 li a1,0
80000114: 00812423 sw s0,8(sp)
80000118: 00112623 sw ra,12(sp)
8000011c: 00050413 mv s0,a0
80000120: 290000ef jal ra,800003b0 <__call_exitprocs>
80000124: 800027b7 lui a5,0x80002
80000128: 9007a503 lw a0,-1792(a5) # 80001900 <__stack_top+0x81001900>
8000012c: 03c52783 lw a5,60(a0)
80000130: 00078463 beqz a5,80000138 <exit+0x2c>
80000134: 000780e7 jalr a5
80000138: 00040513 mv a0,s0
8000013c: f79ff0ef jal ra,800000b4 <_exit>
80000130 <vx_barrier>: 80000140 <__libc_fini_array>:
80000130: 00b5406b 0xb5406b 80000140: ff010113 addi sp,sp,-16
80000134: 00008067 ret 80000144: 00812423 sw s0,8(sp)
80000148: 800017b7 lui a5,0x80001
8000014c: 80001437 lui s0,0x80001
80000150: 4d840413 addi s0,s0,1240 # 800014d8 <__stack_top+0x810014d8>
80000154: 4d878793 addi a5,a5,1240 # 800014d8 <__stack_top+0x810014d8>
80000158: 408787b3 sub a5,a5,s0
8000015c: 00912223 sw s1,4(sp)
80000160: 00112623 sw ra,12(sp)
80000164: 4027d493 srai s1,a5,0x2
80000168: 02048063 beqz s1,80000188 <__libc_fini_array+0x48>
8000016c: ffc78793 addi a5,a5,-4
80000170: 00878433 add s0,a5,s0
80000174: 00042783 lw a5,0(s0)
80000178: fff48493 addi s1,s1,-1
8000017c: ffc40413 addi s0,s0,-4
80000180: 000780e7 jalr a5
80000184: fe0498e3 bnez s1,80000174 <__libc_fini_array+0x34>
80000188: 00c12083 lw ra,12(sp)
8000018c: 00812403 lw s0,8(sp)
80000190: 00412483 lw s1,4(sp)
80000194: 01010113 addi sp,sp,16
80000198: 00008067 ret
80000138 <vx_split>: 8000019c <__libc_init_array>:
80000138: 0005206b 0x5206b 8000019c: ff010113 addi sp,sp,-16
8000013c: 00008067 ret 800001a0: 00812423 sw s0,8(sp)
800001a4: 01212023 sw s2,0(sp)
800001a8: 80001437 lui s0,0x80001
800001ac: 80001937 lui s2,0x80001
800001b0: 4d440793 addi a5,s0,1236 # 800014d4 <__stack_top+0x810014d4>
800001b4: 4d490913 addi s2,s2,1236 # 800014d4 <__stack_top+0x810014d4>
800001b8: 40f90933 sub s2,s2,a5
800001bc: 00112623 sw ra,12(sp)
800001c0: 00912223 sw s1,4(sp)
800001c4: 40295913 srai s2,s2,0x2
800001c8: 02090063 beqz s2,800001e8 <__libc_init_array+0x4c>
800001cc: 4d440413 addi s0,s0,1236
800001d0: 00000493 li s1,0
800001d4: 00042783 lw a5,0(s0)
800001d8: 00148493 addi s1,s1,1
800001dc: 00440413 addi s0,s0,4
800001e0: 000780e7 jalr a5
800001e4: fe9918e3 bne s2,s1,800001d4 <__libc_init_array+0x38>
800001e8: 80001437 lui s0,0x80001
800001ec: 80001937 lui s2,0x80001
800001f0: 4d440793 addi a5,s0,1236 # 800014d4 <__stack_top+0x810014d4>
800001f4: 4d890913 addi s2,s2,1240 # 800014d8 <__stack_top+0x810014d8>
800001f8: 40f90933 sub s2,s2,a5
800001fc: 40295913 srai s2,s2,0x2
80000200: 02090063 beqz s2,80000220 <__libc_init_array+0x84>
80000204: 4d440413 addi s0,s0,1236
80000208: 00000493 li s1,0
8000020c: 00042783 lw a5,0(s0)
80000210: 00148493 addi s1,s1,1
80000214: 00440413 addi s0,s0,4
80000218: 000780e7 jalr a5
8000021c: fe9918e3 bne s2,s1,8000020c <__libc_init_array+0x70>
80000220: 00c12083 lw ra,12(sp)
80000224: 00812403 lw s0,8(sp)
80000228: 00412483 lw s1,4(sp)
8000022c: 00012903 lw s2,0(sp)
80000230: 01010113 addi sp,sp,16
80000234: 00008067 ret
80000140 <vx_join>: 80000238 <memset>:
80000140: 0000306b 0x306b 80000238: 00f00313 li t1,15
80000144: 00008067 ret 8000023c: 00050713 mv a4,a0
80000240: 02c37e63 bgeu t1,a2,8000027c <memset+0x44>
80000244: 00f77793 andi a5,a4,15
80000248: 0a079063 bnez a5,800002e8 <memset+0xb0>
8000024c: 08059263 bnez a1,800002d0 <memset+0x98>
80000250: ff067693 andi a3,a2,-16
80000254: 00f67613 andi a2,a2,15
80000258: 00e686b3 add a3,a3,a4
8000025c: 00b72023 sw a1,0(a4)
80000260: 00b72223 sw a1,4(a4)
80000264: 00b72423 sw a1,8(a4)
80000268: 00b72623 sw a1,12(a4)
8000026c: 01070713 addi a4,a4,16
80000270: fed766e3 bltu a4,a3,8000025c <memset+0x24>
80000274: 00061463 bnez a2,8000027c <memset+0x44>
80000278: 00008067 ret
8000027c: 40c306b3 sub a3,t1,a2
80000280: 00269693 slli a3,a3,0x2
80000284: 00000297 auipc t0,0x0
80000288: 005686b3 add a3,a3,t0
8000028c: 00c68067 jr 12(a3)
80000290: 00b70723 sb a1,14(a4)
80000294: 00b706a3 sb a1,13(a4)
80000298: 00b70623 sb a1,12(a4)
8000029c: 00b705a3 sb a1,11(a4)
800002a0: 00b70523 sb a1,10(a4)
800002a4: 00b704a3 sb a1,9(a4)
800002a8: 00b70423 sb a1,8(a4)
800002ac: 00b703a3 sb a1,7(a4)
800002b0: 00b70323 sb a1,6(a4)
800002b4: 00b702a3 sb a1,5(a4)
800002b8: 00b70223 sb a1,4(a4)
800002bc: 00b701a3 sb a1,3(a4)
800002c0: 00b70123 sb a1,2(a4)
800002c4: 00b700a3 sb a1,1(a4)
800002c8: 00b70023 sb a1,0(a4)
800002cc: 00008067 ret
800002d0: 0ff5f593 andi a1,a1,255
800002d4: 00859693 slli a3,a1,0x8
800002d8: 00d5e5b3 or a1,a1,a3
800002dc: 01059693 slli a3,a1,0x10
800002e0: 00d5e5b3 or a1,a1,a3
800002e4: f6dff06f j 80000250 <memset+0x18>
800002e8: 00279693 slli a3,a5,0x2
800002ec: 00000297 auipc t0,0x0
800002f0: 005686b3 add a3,a3,t0
800002f4: 00008293 mv t0,ra
800002f8: fa0680e7 jalr -96(a3)
800002fc: 00028093 mv ra,t0
80000300: ff078793 addi a5,a5,-16
80000304: 40f70733 sub a4,a4,a5
80000308: 00f60633 add a2,a2,a5
8000030c: f6c378e3 bgeu t1,a2,8000027c <memset+0x44>
80000310: f3dff06f j 8000024c <memset+0x14>
80000148 <vx_warp_id>: 80000314 <__register_exitproc>:
80000148: cc302573 csrr a0,0xcc3 80000314: 800027b7 lui a5,0x80002
8000014c: 00008067 ret 80000318: 9007a703 lw a4,-1792(a5) # 80001900 <__stack_top+0x81001900>
8000031c: 14872783 lw a5,328(a4)
80000150 <vx_warp_gid>: 80000320: 04078c63 beqz a5,80000378 <__register_exitproc+0x64>
80000150: f1402573 csrr a0,mhartid 80000324: 0047a703 lw a4,4(a5)
80000154: 00008067 ret 80000328: 01f00813 li a6,31
8000032c: 06e84e63 blt a6,a4,800003a8 <__register_exitproc+0x94>
80000158 <vx_thread_id>: 80000330: 00271813 slli a6,a4,0x2
80000158: cc002573 csrr a0,0xcc0 80000334: 02050663 beqz a0,80000360 <__register_exitproc+0x4c>
8000015c: 00008067 ret 80000338: 01078333 add t1,a5,a6
8000033c: 08c32423 sw a2,136(t1)
80000160 <vx_thread_lid>: 80000340: 1887a883 lw a7,392(a5)
80000160: cc102573 csrr a0,0xcc1 80000344: 00100613 li a2,1
80000164: 00008067 ret 80000348: 00e61633 sll a2,a2,a4
8000034c: 00c8e8b3 or a7,a7,a2
80000168 <vx_thread_gid>: 80000350: 1917a423 sw a7,392(a5)
80000168: cc202573 csrr a0,0xcc2 80000354: 10d32423 sw a3,264(t1)
8000016c: 00008067 ret 80000358: 00200693 li a3,2
8000035c: 02d50463 beq a0,a3,80000384 <__register_exitproc+0x70>
80000170 <vx_core_id>: 80000360: 00170713 addi a4,a4,1
80000170: cc502573 csrr a0,0xcc5 80000364: 00e7a223 sw a4,4(a5)
80000174: 00008067 ret 80000368: 010787b3 add a5,a5,a6
8000036c: 00b7a423 sw a1,8(a5)
80000178 <vx_num_threads>: 80000370: 00000513 li a0,0
80000178: fc002573 csrr a0,0xfc0
8000017c: 00008067 ret
80000180 <vx_num_warps>:
80000180: fc102573 csrr a0,0xfc1
80000184: 00008067 ret
80000188 <vx_num_cores>:
80000188: fc202573 csrr a0,0xfc2
8000018c: 00008067 ret
80000190 <vx_num_cycles>:
80000190: c0002573 rdcycle a0
80000194: 00008067 ret
80000198 <vx_num_instrs>:
80000198: c0202573 rdinstret a0
8000019c: 00008067 ret
800001a0 <atexit>:
800001a0: 00050593 mv a1,a0
800001a4: 00000693 li a3,0
800001a8: 00000613 li a2,0
800001ac: 00000513 li a0,0
800001b0: 20c0006f j 800003bc <__register_exitproc>
800001b4 <exit>:
800001b4: ff010113 addi sp,sp,-16
800001b8: 00000593 li a1,0
800001bc: 00812423 sw s0,8(sp)
800001c0: 00112623 sw ra,12(sp)
800001c4: 00050413 mv s0,a0
800001c8: 290000ef jal ra,80000458 <__call_exitprocs>
800001cc: 800027b7 lui a5,0x80002
800001d0: 9a87a503 lw a0,-1624(a5) # 800019a8 <__stack_top+0x810019a8>
800001d4: 03c52783 lw a5,60(a0)
800001d8: 00078463 beqz a5,800001e0 <exit+0x2c>
800001dc: 000780e7 jalr a5
800001e0: 00040513 mv a0,s0
800001e4: ef9ff0ef jal ra,800000dc <_exit>
800001e8 <__libc_fini_array>:
800001e8: ff010113 addi sp,sp,-16
800001ec: 00812423 sw s0,8(sp)
800001f0: 800017b7 lui a5,0x80001
800001f4: 80001437 lui s0,0x80001
800001f8: 58040413 addi s0,s0,1408 # 80001580 <__stack_top+0x81001580>
800001fc: 58078793 addi a5,a5,1408 # 80001580 <__stack_top+0x81001580>
80000200: 408787b3 sub a5,a5,s0
80000204: 00912223 sw s1,4(sp)
80000208: 00112623 sw ra,12(sp)
8000020c: 4027d493 srai s1,a5,0x2
80000210: 02048063 beqz s1,80000230 <__libc_fini_array+0x48>
80000214: ffc78793 addi a5,a5,-4
80000218: 00878433 add s0,a5,s0
8000021c: 00042783 lw a5,0(s0)
80000220: fff48493 addi s1,s1,-1
80000224: ffc40413 addi s0,s0,-4
80000228: 000780e7 jalr a5
8000022c: fe0498e3 bnez s1,8000021c <__libc_fini_array+0x34>
80000230: 00c12083 lw ra,12(sp)
80000234: 00812403 lw s0,8(sp)
80000238: 00412483 lw s1,4(sp)
8000023c: 01010113 addi sp,sp,16
80000240: 00008067 ret
80000244 <__libc_init_array>:
80000244: ff010113 addi sp,sp,-16
80000248: 00812423 sw s0,8(sp)
8000024c: 01212023 sw s2,0(sp)
80000250: 80001437 lui s0,0x80001
80000254: 80001937 lui s2,0x80001
80000258: 57c40793 addi a5,s0,1404 # 8000157c <__stack_top+0x8100157c>
8000025c: 57c90913 addi s2,s2,1404 # 8000157c <__stack_top+0x8100157c>
80000260: 40f90933 sub s2,s2,a5
80000264: 00112623 sw ra,12(sp)
80000268: 00912223 sw s1,4(sp)
8000026c: 40295913 srai s2,s2,0x2
80000270: 02090063 beqz s2,80000290 <__libc_init_array+0x4c>
80000274: 57c40413 addi s0,s0,1404
80000278: 00000493 li s1,0
8000027c: 00042783 lw a5,0(s0)
80000280: 00148493 addi s1,s1,1
80000284: 00440413 addi s0,s0,4
80000288: 000780e7 jalr a5
8000028c: fe9918e3 bne s2,s1,8000027c <__libc_init_array+0x38>
80000290: 80001437 lui s0,0x80001
80000294: 80001937 lui s2,0x80001
80000298: 57c40793 addi a5,s0,1404 # 8000157c <__stack_top+0x8100157c>
8000029c: 58090913 addi s2,s2,1408 # 80001580 <__stack_top+0x81001580>
800002a0: 40f90933 sub s2,s2,a5
800002a4: 40295913 srai s2,s2,0x2
800002a8: 02090063 beqz s2,800002c8 <__libc_init_array+0x84>
800002ac: 57c40413 addi s0,s0,1404
800002b0: 00000493 li s1,0
800002b4: 00042783 lw a5,0(s0)
800002b8: 00148493 addi s1,s1,1
800002bc: 00440413 addi s0,s0,4
800002c0: 000780e7 jalr a5
800002c4: fe9918e3 bne s2,s1,800002b4 <__libc_init_array+0x70>
800002c8: 00c12083 lw ra,12(sp)
800002cc: 00812403 lw s0,8(sp)
800002d0: 00412483 lw s1,4(sp)
800002d4: 00012903 lw s2,0(sp)
800002d8: 01010113 addi sp,sp,16
800002dc: 00008067 ret
800002e0 <memset>:
800002e0: 00f00313 li t1,15
800002e4: 00050713 mv a4,a0
800002e8: 02c37e63 bgeu t1,a2,80000324 <memset+0x44>
800002ec: 00f77793 andi a5,a4,15
800002f0: 0a079063 bnez a5,80000390 <memset+0xb0>
800002f4: 08059263 bnez a1,80000378 <memset+0x98>
800002f8: ff067693 andi a3,a2,-16
800002fc: 00f67613 andi a2,a2,15
80000300: 00e686b3 add a3,a3,a4
80000304: 00b72023 sw a1,0(a4)
80000308: 00b72223 sw a1,4(a4)
8000030c: 00b72423 sw a1,8(a4)
80000310: 00b72623 sw a1,12(a4)
80000314: 01070713 addi a4,a4,16
80000318: fed766e3 bltu a4,a3,80000304 <memset+0x24>
8000031c: 00061463 bnez a2,80000324 <memset+0x44>
80000320: 00008067 ret
80000324: 40c306b3 sub a3,t1,a2
80000328: 00269693 slli a3,a3,0x2
8000032c: 00000297 auipc t0,0x0
80000330: 005686b3 add a3,a3,t0
80000334: 00c68067 jr 12(a3)
80000338: 00b70723 sb a1,14(a4)
8000033c: 00b706a3 sb a1,13(a4)
80000340: 00b70623 sb a1,12(a4)
80000344: 00b705a3 sb a1,11(a4)
80000348: 00b70523 sb a1,10(a4)
8000034c: 00b704a3 sb a1,9(a4)
80000350: 00b70423 sb a1,8(a4)
80000354: 00b703a3 sb a1,7(a4)
80000358: 00b70323 sb a1,6(a4)
8000035c: 00b702a3 sb a1,5(a4)
80000360: 00b70223 sb a1,4(a4)
80000364: 00b701a3 sb a1,3(a4)
80000368: 00b70123 sb a1,2(a4)
8000036c: 00b700a3 sb a1,1(a4)
80000370: 00b70023 sb a1,0(a4)
80000374: 00008067 ret 80000374: 00008067 ret
80000378: 0ff5f593 andi a1,a1,255 80000378: 14c70793 addi a5,a4,332
8000037c: 00859693 slli a3,a1,0x8 8000037c: 14f72423 sw a5,328(a4)
80000380: 00d5e5b3 or a1,a1,a3 80000380: fa5ff06f j 80000324 <__register_exitproc+0x10>
80000384: 01059693 slli a3,a1,0x10 80000384: 18c7a683 lw a3,396(a5)
80000388: 00d5e5b3 or a1,a1,a3 80000388: 00170713 addi a4,a4,1
8000038c: f6dff06f j 800002f8 <memset+0x18> 8000038c: 00e7a223 sw a4,4(a5)
80000390: 00279693 slli a3,a5,0x2 80000390: 00c6e633 or a2,a3,a2
80000394: 00000297 auipc t0,0x0 80000394: 18c7a623 sw a2,396(a5)
80000398: 005686b3 add a3,a3,t0 80000398: 010787b3 add a5,a5,a6
8000039c: 00008293 mv t0,ra 8000039c: 00b7a423 sw a1,8(a5)
800003a0: fa0680e7 jalr -96(a3) 800003a0: 00000513 li a0,0
800003a4: 00028093 mv ra,t0 800003a4: 00008067 ret
800003a8: ff078793 addi a5,a5,-16 800003a8: fff00513 li a0,-1
800003ac: 40f70733 sub a4,a4,a5 800003ac: 00008067 ret
800003b0: 00f60633 add a2,a2,a5
800003b4: f6c378e3 bgeu t1,a2,80000324 <memset+0x44>
800003b8: f3dff06f j 800002f4 <memset+0x14>
800003bc <__register_exitproc>: 800003b0 <__call_exitprocs>:
800003bc: 800027b7 lui a5,0x80002 800003b0: fd010113 addi sp,sp,-48
800003c0: 9a87a703 lw a4,-1624(a5) # 800019a8 <__stack_top+0x810019a8> 800003b4: 800027b7 lui a5,0x80002
800003c4: 14872783 lw a5,328(a4) 800003b8: 01412c23 sw s4,24(sp)
800003c8: 04078c63 beqz a5,80000420 <__register_exitproc+0x64> 800003bc: 9007aa03 lw s4,-1792(a5) # 80001900 <__stack_top+0x81001900>
800003cc: 0047a703 lw a4,4(a5) 800003c0: 03212023 sw s2,32(sp)
800003d0: 01f00813 li a6,31 800003c4: 02112623 sw ra,44(sp)
800003d4: 06e84e63 blt a6,a4,80000450 <__register_exitproc+0x94> 800003c8: 148a2903 lw s2,328(s4)
800003d8: 00271813 slli a6,a4,0x2 800003cc: 02812423 sw s0,40(sp)
800003dc: 02050663 beqz a0,80000408 <__register_exitproc+0x4c> 800003d0: 02912223 sw s1,36(sp)
800003e0: 01078333 add t1,a5,a6 800003d4: 01312e23 sw s3,28(sp)
800003e4: 08c32423 sw a2,136(t1) 800003d8: 01512a23 sw s5,20(sp)
800003e8: 1887a883 lw a7,392(a5) 800003dc: 01612823 sw s6,16(sp)
800003ec: 00100613 li a2,1 800003e0: 01712623 sw s7,12(sp)
800003f0: 00e61633 sll a2,a2,a4 800003e4: 01812423 sw s8,8(sp)
800003f4: 00c8e8b3 or a7,a7,a2 800003e8: 04090063 beqz s2,80000428 <__call_exitprocs+0x78>
800003f8: 1917a423 sw a7,392(a5) 800003ec: 00050b13 mv s6,a0
800003fc: 10d32423 sw a3,264(t1) 800003f0: 00058b93 mv s7,a1
80000400: 00200693 li a3,2 800003f4: 00100a93 li s5,1
80000404: 02d50463 beq a0,a3,8000042c <__register_exitproc+0x70> 800003f8: fff00993 li s3,-1
80000408: 00170713 addi a4,a4,1 800003fc: 00492483 lw s1,4(s2)
8000040c: 00e7a223 sw a4,4(a5) 80000400: fff48413 addi s0,s1,-1
80000410: 010787b3 add a5,a5,a6 80000404: 02044263 bltz s0,80000428 <__call_exitprocs+0x78>
80000414: 00b7a423 sw a1,8(a5) 80000408: 00249493 slli s1,s1,0x2
80000418: 00000513 li a0,0 8000040c: 009904b3 add s1,s2,s1
8000041c: 00008067 ret 80000410: 040b8463 beqz s7,80000458 <__call_exitprocs+0xa8>
80000420: 14c70793 addi a5,a4,332 80000414: 1044a783 lw a5,260(s1)
80000424: 14f72423 sw a5,328(a4) 80000418: 05778063 beq a5,s7,80000458 <__call_exitprocs+0xa8>
80000428: fa5ff06f j 800003cc <__register_exitproc+0x10> 8000041c: fff40413 addi s0,s0,-1
8000042c: 18c7a683 lw a3,396(a5) 80000420: ffc48493 addi s1,s1,-4
80000430: 00170713 addi a4,a4,1 80000424: ff3416e3 bne s0,s3,80000410 <__call_exitprocs+0x60>
80000434: 00e7a223 sw a4,4(a5) 80000428: 02c12083 lw ra,44(sp)
80000438: 00c6e633 or a2,a3,a2 8000042c: 02812403 lw s0,40(sp)
8000043c: 18c7a623 sw a2,396(a5) 80000430: 02412483 lw s1,36(sp)
80000440: 010787b3 add a5,a5,a6 80000434: 02012903 lw s2,32(sp)
80000444: 00b7a423 sw a1,8(a5) 80000438: 01c12983 lw s3,28(sp)
80000448: 00000513 li a0,0 8000043c: 01812a03 lw s4,24(sp)
8000044c: 00008067 ret 80000440: 01412a83 lw s5,20(sp)
80000450: fff00513 li a0,-1 80000444: 01012b03 lw s6,16(sp)
80000448: 00c12b83 lw s7,12(sp)
8000044c: 00812c03 lw s8,8(sp)
80000450: 03010113 addi sp,sp,48
80000454: 00008067 ret 80000454: 00008067 ret
80000458: 00492783 lw a5,4(s2)
80000458 <__call_exitprocs>: 8000045c: 0044a683 lw a3,4(s1)
80000458: fd010113 addi sp,sp,-48 80000460: fff78793 addi a5,a5,-1
8000045c: 800027b7 lui a5,0x80002 80000464: 04878e63 beq a5,s0,800004c0 <__call_exitprocs+0x110>
80000460: 01412c23 sw s4,24(sp) 80000468: 0004a223 sw zero,4(s1)
80000464: 9a87aa03 lw s4,-1624(a5) # 800019a8 <__stack_top+0x810019a8> 8000046c: fa0688e3 beqz a3,8000041c <__call_exitprocs+0x6c>
80000468: 03212023 sw s2,32(sp) 80000470: 18892783 lw a5,392(s2)
8000046c: 02112623 sw ra,44(sp) 80000474: 008a9733 sll a4,s5,s0
80000470: 148a2903 lw s2,328(s4) 80000478: 00492c03 lw s8,4(s2)
80000474: 02812423 sw s0,40(sp) 8000047c: 00f777b3 and a5,a4,a5
80000478: 02912223 sw s1,36(sp) 80000480: 02079263 bnez a5,800004a4 <__call_exitprocs+0xf4>
8000047c: 01312e23 sw s3,28(sp) 80000484: 000680e7 jalr a3
80000480: 01512a23 sw s5,20(sp) 80000488: 00492703 lw a4,4(s2)
80000484: 01612823 sw s6,16(sp) 8000048c: 148a2783 lw a5,328(s4)
80000488: 01712623 sw s7,12(sp) 80000490: 01871463 bne a4,s8,80000498 <__call_exitprocs+0xe8>
8000048c: 01812423 sw s8,8(sp) 80000494: f8f904e3 beq s2,a5,8000041c <__call_exitprocs+0x6c>
80000490: 04090063 beqz s2,800004d0 <__call_exitprocs+0x78> 80000498: f80788e3 beqz a5,80000428 <__call_exitprocs+0x78>
80000494: 00050b13 mv s6,a0 8000049c: 00078913 mv s2,a5
80000498: 00058b93 mv s7,a1 800004a0: f5dff06f j 800003fc <__call_exitprocs+0x4c>
8000049c: 00100a93 li s5,1 800004a4: 18c92783 lw a5,396(s2)
800004a0: fff00993 li s3,-1 800004a8: 0844a583 lw a1,132(s1)
800004a4: 00492483 lw s1,4(s2) 800004ac: 00f77733 and a4,a4,a5
800004a8: fff48413 addi s0,s1,-1 800004b0: 00071c63 bnez a4,800004c8 <__call_exitprocs+0x118>
800004ac: 02044263 bltz s0,800004d0 <__call_exitprocs+0x78> 800004b4: 000b0513 mv a0,s6
800004b0: 00249493 slli s1,s1,0x2 800004b8: 000680e7 jalr a3
800004b4: 009904b3 add s1,s2,s1 800004bc: fcdff06f j 80000488 <__call_exitprocs+0xd8>
800004b8: 040b8463 beqz s7,80000500 <__call_exitprocs+0xa8> 800004c0: 00892223 sw s0,4(s2)
800004bc: 1044a783 lw a5,260(s1) 800004c4: fa9ff06f j 8000046c <__call_exitprocs+0xbc>
800004c0: 05778063 beq a5,s7,80000500 <__call_exitprocs+0xa8> 800004c8: 00058513 mv a0,a1
800004c4: fff40413 addi s0,s0,-1 800004cc: 000680e7 jalr a3
800004c8: ffc48493 addi s1,s1,-4 800004d0: fb9ff06f j 80000488 <__call_exitprocs+0xd8>
800004cc: ff3416e3 bne s0,s3,800004b8 <__call_exitprocs+0x60>
800004d0: 02c12083 lw ra,44(sp)
800004d4: 02812403 lw s0,40(sp)
800004d8: 02412483 lw s1,36(sp)
800004dc: 02012903 lw s2,32(sp)
800004e0: 01c12983 lw s3,28(sp)
800004e4: 01812a03 lw s4,24(sp)
800004e8: 01412a83 lw s5,20(sp)
800004ec: 01012b03 lw s6,16(sp)
800004f0: 00c12b83 lw s7,12(sp)
800004f4: 00812c03 lw s8,8(sp)
800004f8: 03010113 addi sp,sp,48
800004fc: 00008067 ret
80000500: 00492783 lw a5,4(s2)
80000504: 0044a683 lw a3,4(s1)
80000508: fff78793 addi a5,a5,-1
8000050c: 04878e63 beq a5,s0,80000568 <__call_exitprocs+0x110>
80000510: 0004a223 sw zero,4(s1)
80000514: fa0688e3 beqz a3,800004c4 <__call_exitprocs+0x6c>
80000518: 18892783 lw a5,392(s2)
8000051c: 008a9733 sll a4,s5,s0
80000520: 00492c03 lw s8,4(s2)
80000524: 00f777b3 and a5,a4,a5
80000528: 02079263 bnez a5,8000054c <__call_exitprocs+0xf4>
8000052c: 000680e7 jalr a3
80000530: 00492703 lw a4,4(s2)
80000534: 148a2783 lw a5,328(s4)
80000538: 01871463 bne a4,s8,80000540 <__call_exitprocs+0xe8>
8000053c: f8f904e3 beq s2,a5,800004c4 <__call_exitprocs+0x6c>
80000540: f80788e3 beqz a5,800004d0 <__call_exitprocs+0x78>
80000544: 00078913 mv s2,a5
80000548: f5dff06f j 800004a4 <__call_exitprocs+0x4c>
8000054c: 18c92783 lw a5,396(s2)
80000550: 0844a583 lw a1,132(s1)
80000554: 00f77733 and a4,a4,a5
80000558: 00071c63 bnez a4,80000570 <__call_exitprocs+0x118>
8000055c: 000b0513 mv a0,s6
80000560: 000680e7 jalr a3
80000564: fcdff06f j 80000530 <__call_exitprocs+0xd8>
80000568: 00892223 sw s0,4(s2)
8000056c: fa9ff06f j 80000514 <__call_exitprocs+0xbc>
80000570: 00058513 mv a0,a1
80000574: 000680e7 jalr a3
80000578: fb9ff06f j 80000530 <__call_exitprocs+0xd8>
Disassembly of section .init_array: Disassembly of section .init_array:
8000157c <__init_array_start>: 800014d4 <__init_array_start>:
8000157c: 00c4 addi s1,sp,68 800014d4: 009c addi a5,sp,64
8000157e: 8000 0x8000 800014d6: 8000 0x8000
Disassembly of section .data: Disassembly of section .data:
80001580 <impure_data>: 800014d8 <impure_data>:
80001580: 0000 unimp 800014d8: 0000 unimp
80001582: 0000 unimp 800014da: 0000 unimp
80001584: 186c addi a1,sp,60 800014dc: 17c4 addi s1,sp,996
80001586: 8000 0x8000 800014de: 8000 0x8000
80001588: 18d4 addi a3,sp,116 800014e0: 182c addi a1,sp,56
8000158a: 8000 0x8000 800014e2: 8000 0x8000
8000158c: 193c addi a5,sp,184 800014e4: 1894 addi a3,sp,112
8000158e: 8000 0x8000 800014e6: 8000 0x8000
... ...
80001628: 0001 nop 80001580: 0001 nop
8000162a: 0000 unimp 80001582: 0000 unimp
8000162c: 0000 unimp 80001584: 0000 unimp
8000162e: 0000 unimp 80001586: 0000 unimp
80001630: 330e fld ft6,224(sp) 80001588: 330e fld ft6,224(sp)
80001632: abcd j 80001c24 <__BSS_END__+0x278> 8000158a: abcd j 80001b7c <__BSS_END__+0x278>
80001634: 1234 addi a3,sp,296 8000158c: 1234 addi a3,sp,296
80001636: e66d bnez a2,80001720 <impure_data+0x1a0> 8000158e: e66d bnez a2,80001678 <impure_data+0x1a0>
80001638: deec sw a1,124(a3) 80001590: deec sw a1,124(a3)
8000163a: 0005 c.nop 1 80001592: 0005 c.nop 1
8000163c: 0000000b 0xb 80001594: 0000000b 0xb
... ...
Disassembly of section .sdata: Disassembly of section .sdata:
800019a8 <_global_impure_ptr>: 80001900 <_global_impure_ptr>:
800019a8: 1580 addi s0,sp,736 80001900: 14d8 addi a4,sp,612
800019aa: 8000 0x8000 80001902: 8000 0x8000
Disassembly of section .comment: Disassembly of section .comment:

Binary file not shown.

View file

@ -9,7 +9,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -9,7 +9,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -nostartfiles -Wl,--gc-sections,-Map,kernel.map VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -nostartfiles -Wl,--gc-sections,-Map,kernel.map
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a
VX_LDFLAGS += -lm VX_LDFLAGS += -lm

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -13,7 +13,7 @@ LDFLAGS +=
PROJECT = libvortexrt PROJECT = libvortexrt
SRCS = ./src/vx_start.S ./src/vx_intrinsics.S ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c SRCS = ./src/vx_start.S ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c
OBJS := $(addsuffix .o, $(notdir $(SRCS))) OBJS := $(addsuffix .o, $(notdir $(SRCS)))

View file

@ -1,57 +1,113 @@
#ifndef VX_INTRINSICS_H #ifndef VX_INTRINSICS_H
#define VX_INTRINSICS_H #define VX_INTRINSICS_H
#include <VX_config.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
// Spawn warps
void vx_wspawn(int num_warps, unsigned func_ptr);
// Set thread mask // Set thread mask
void vx_tmc(int num_threads); inline void vx_tmc(unsigned num_threads) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(num_threads));
}
// Warp Barrier // Spawn warps
void vx_barrier(int barried_id, int num_warps); inline void vx_wspawn(unsigned num_warps, void* func_ptr) {
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
}
// Split on a predicate // Split on a predicate
void vx_split(int predicate); inline void vx_split(int predicate) {
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
}
// Join // Join
void vx_join(); inline void vx_join() {
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
}
// Warp Barrier
// Issues the custom instruction with opcode 0x6b / funct3 4, passing
// barried_id in rs1 and num_warps in rs2. Returns nothing.
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
    // S-type .insn operand order: opcode, funct3, rs2, simm12(rs1).
    // The offset is a plain immediate 0, matching the other intrinsics here.
    asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
}
// Return active warp's thread id // Return active warp's thread id
int vx_thread_id(); inline int vx_thread_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
return result;
}
// Return active core's local thread id // Return active core's local thread id
int vx_thread_lid(); inline int vx_thread_lid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
return result;
}
// Return processor global thread id // Return processor global thread id
int vx_thread_gid(); inline int vx_thread_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
return result;
}
// Return active core's local warp id // Return active core's local warp id
int vx_warp_id(); inline int vx_warp_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
return result;
}
// Return processor's global warp id // Return processor's global warp id
int vx_warp_gid(); inline int vx_warp_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
return result;
}
// Return processor core id // Return processor core id
int vx_core_id(); inline int vx_core_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
return result;
}
// Return the number of threads in a warp // Return the number of threads in a warp
int vx_num_threads(); inline int vx_num_threads() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
return result;
}
// Return the number of warps in a core // Return the number of warps in a core
int vx_num_warps(); inline int vx_num_warps() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
return result;
}
// Return the number of cores in the processor // Return the number of cores in the processor
int vx_num_cores(); inline int vx_num_cores() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
return result;
}
// Return the number of cycles // Return the number of cycles
int vx_num_cycles(); inline int vx_num_cycles() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_CYCLE));
return result;
}
// Return the number of instructions // Return the number of instructions
int vx_num_instrs(); inline int vx_num_instrs() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_INSTRET));
return result;
}
#define __if(b) vx_split(b); \ #define __if(b) vx_split(b); \
if (b) if (b)

View file

@ -8,9 +8,29 @@
extern "C" { extern "C" {
#endif #endif
typedef void (*pfn_callback)(int task_id, void *arg); struct context_t {
uint32_t num_groups[3];
uint32_t global_offset[3];
uint32_t local_size[3];
char * printf_buffer;
uint32_t *printf_buffer_position;
uint32_t printf_buffer_capacity;
uint32_t work_dim;
};
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args); typedef void (*pfn_workgroup_func) (
const void * /* args */,
const struct context_t * /* context */,
uint32_t /* group_x */,
uint32_t /* group_y */,
uint32_t /* group_z */
);
typedef void (*pfn_callback)(int task_id, const void *arg);
void vx_spawn_kernel(struct context_t * ctx, pfn_workgroup_func wg_func, const void * args);
void vx_spawn_tasks(int num_tasks, pfn_callback callback, const void * args);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -1,99 +0,0 @@
#include <VX_config.h>
.section .text
.type vx_wspawn, @function
.global vx_wspawn
vx_wspawn:
.word 0x00b5106b # wspawn a0(num_warps), a1(func_ptr)
ret
.type vx_tmc, @function
.global vx_tmc
vx_tmc:
.word 0x0005006b # tmc a0
ret
.type vx_barrier, @function
.global vx_barrier
vx_barrier:
.word 0x00b5406b # barrier a0(barrier_id), a1(num_warps)
ret
.type vx_split, @function
.global vx_split
vx_split:
.word 0x0005206b # split a0
ret
.type vx_join, @function
.global vx_join
vx_join:
.word 0x0000306b #join
ret
.type vx_warp_id, @function
.global vx_warp_id
vx_warp_id:
csrr a0, CSR_LWID
ret
.type vx_warp_gid, @function
.global vx_warp_gid
vx_warp_gid:
csrr a0, CSR_GWID
ret
.type vx_thread_id, @function
.global vx_thread_id
vx_thread_id:
csrr a0, CSR_WTID
ret
.type vx_thread_lid, @function
.global vx_thread_lid
vx_thread_lid:
csrr a0, CSR_LTID
ret
.type vx_thread_gid, @function
.global vx_thread_gid
vx_thread_gid:
csrr a0, CSR_GTID
ret
.type vx_core_id, @function
.global vx_core_id
vx_core_id:
csrr a0, CSR_GCID
ret
.type vx_num_threads, @function
.global vx_num_threads
vx_num_threads:
csrr a0, CSR_NT
ret
.type vx_num_warps, @function
.global vx_num_warps
vx_num_warps:
csrr a0, CSR_NW
ret
.type vx_num_cores, @function
.global vx_num_cores
vx_num_cores:
csrr a0, CSR_NC
ret
.type vx_num_cycles, @function
.global vx_num_cycles
vx_num_cycles:
csrr a0, CSR_CYCLE
ret
.type vx_num_instrs, @function
.global vx_num_instrs
vx_num_instrs:
csrr a0, CSR_INSTRET
ret

View file

@ -12,13 +12,34 @@ extern "C" {
typedef struct { typedef struct {
pfn_callback callback; pfn_callback callback;
void * args; const void * args;
int offset; int offset;
int N; int N;
int R; int R;
} wspawn_args_t; } wspawn_tasks_args_t;
wspawn_args_t* g_wspawn_args[NUM_CORES_MAX]; typedef struct {
struct context_t * ctx;
pfn_workgroup_func wg_func;
const void * args;
int offset;
int N;
int R;
char isXYpow2;
char isXpow2;
char log2XY;
char log2X;
} wspawn_kernel_args_t;
void* g_wspawn_args[NUM_CORES_MAX];
// Returns 1 when x is a positive power of two, 0 otherwise.
// `static` gives the helper internal linkage so the translation unit still
// links when the compiler decides not to inline the call.
static inline char is_log2(int x) {
  // x & (x-1) clears the lowest set bit; the result is zero only when at
  // most one bit is set. Zero and negative values are rejected explicitly.
  return (x > 0) && ((x & (x - 1)) == 0);
}
// Returns floor(log2(x)) for positive x by reading the exponent field of
// x converted to single-precision float.
// Assumes a 32-bit int and IEEE-754 binary32 float. The result is exact for
// powers of two; for other values above 2^24 the int-to-float rounding may
// push the exponent up by one. x <= 0 does not yield a meaningful result.
static inline int fast_log2(int x) {
  // The value must be converted to float first: the exponent only exists in
  // the float encoding, not in the integer's own bits. A union is used for
  // the reinterpretation instead of a pointer cast to avoid aliasing issues.
  union { float f; int i; } bits;
  bits.f = (float)x;
  return (bits.i >> 23) - 127;
}
void spawn_tasks_callback() { void spawn_tasks_callback() {
vx_tmc(vx_num_threads()); vx_tmc(vx_num_threads());
@ -28,7 +49,7 @@ void spawn_tasks_callback() {
int tid = vx_thread_id(); int tid = vx_thread_id();
int NT = vx_num_threads(); int NT = vx_num_threads();
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id]; wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid); int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R); int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
@ -47,7 +68,7 @@ void spawn_remaining_tasks_callback(int nthreads) {
int core_id = vx_core_id(); int core_id = vx_core_id();
int tid = vx_thread_gid(); int tid = vx_thread_gid();
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id]; wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
int task_id = p_wspawn_args->offset + tid; int task_id = p_wspawn_args->offset + tid;
(p_wspawn_args->callback)(task_id, p_wspawn_args->args); (p_wspawn_args->callback)(task_id, p_wspawn_args->args);
@ -55,7 +76,7 @@ void spawn_remaining_tasks_callback(int nthreads) {
vx_tmc(1); vx_tmc(1);
} }
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) { void vx_spawn_tasks(int num_tasks, pfn_callback callback , const void * args) {
// device specs // device specs
int NC = vx_num_cores(); int NC = vx_num_cores();
int NW = vx_num_warps(); int NW = vx_num_warps();
@ -90,7 +111,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
fW = 1; fW = 1;
//-- //--
wspawn_args_t wspawn_args = { callback, args, core_id * tasks_per_core, fW, rW }; wspawn_tasks_args_t wspawn_args = { callback, args, core_id * tasks_per_core, fW, rW };
g_wspawn_args[core_id] = &wspawn_args; g_wspawn_args[core_id] = &wspawn_args;
//-- //--
@ -107,6 +128,135 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
} }
} }
///////////////////////////////////////////////////////////////////////////////
void spawn_kernel_callback() {
vx_tmc(vx_num_threads());
int core_id = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
int NT = vx_num_threads();
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
int X = p_wspawn_args->ctx->num_groups[0];
int Y = p_wspawn_args->ctx->num_groups[1];
int XY = X * Y;
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
int k = p_wspawn_args->isXYpow2 ? (wg_id / XY) : (wg_id >> p_wspawn_args->log2XY);
int wg_2d = wg_id - k * XY;
int j = p_wspawn_args->isXpow2 ? (wg_2d / X) : (wg_2d >> p_wspawn_args->log2X);
int i = wg_2d - j * X;
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
(p_wspawn_args->wg_func)(p_wspawn_args->args, p_wspawn_args->ctx, gid0, gid1, gid2);
}
vx_tmc(0 == wid);
}
void spawn_kernel_remaining_callback(int nthreads) {
vx_tmc(nthreads);
int core_id = vx_core_id();
int tid = vx_thread_gid();
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
int wg_id = p_wspawn_args->offset + tid;
int X = p_wspawn_args->ctx->num_groups[0];
int Y = p_wspawn_args->ctx->num_groups[1];
int XY = X * Y;
int k = p_wspawn_args->isXYpow2 ? (wg_id / XY) : (wg_id >> p_wspawn_args->log2XY);
int wg_2d = wg_id - k * XY;
int j = p_wspawn_args->isXpow2 ? (wg_2d / X) : (wg_2d >> p_wspawn_args->log2X);
int i = wg_2d - j * X;
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
(p_wspawn_args->wg_func)(p_wspawn_args->args, p_wspawn_args->ctx, gid0, gid1, gid2);
vx_tmc(1);
}
void vx_spawn_kernel(struct context_t * ctx, pfn_workgroup_func wg_func, const void * args) {
// total number of WGs
int X = ctx->num_groups[0];
int Y = ctx->num_groups[1];
int Z = ctx->num_groups[2];
int XY = X * Y;
int Q = XY * Z;
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT;
int nC = (Q > WT) ? (Q / WT) : 1;
int nc = MIN(nC, NC);
if (core_id >= nc)
return; // terminate extra cores
// number of workgroups per core
int wgs_per_core = Q / nc;
int wgs_per_core0 = wgs_per_core;
if (core_id == (NC-1)) {
int QC_r = Q - (nc * wgs_per_core0);
wgs_per_core0 += QC_r; // last core executes remaining WGs
}
// number of workgroups per warp
int nW = wgs_per_core0 / NT; // total warps per core
int rT = wgs_per_core0 - (nW * NT); // remaining threads
int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations
int rW = (fW != 0) ? (nW - fW * NW) : 0; // reamining full warps
if (0 == fW)
fW = 1;
// fast path handling
char isXYpow2 = is_log2(XY);
char isXpow2 = is_log2(X);
char log2XY = fast_log2(XY);
char log2X = fast_log2(X);
//--
wspawn_kernel_args_t wspawn_args = { ctx, wg_func, args, core_id * wgs_per_core, fW, rW, isXYpow2, isXpow2, log2XY, log2X };
g_wspawn_args[core_id] = &wspawn_args;
//--
if (nW >= 1) {
int nw = MIN(nW, NW);
vx_wspawn(nw, (unsigned)&spawn_kernel_callback);
spawn_kernel_callback();
}
//--
if (rT != 0) {
wspawn_args.offset = wgs_per_core0 - rT;
spawn_kernel_remaining_callback(rT);
}
}
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View file

@ -8,12 +8,12 @@ _start:
# execute stack initialization on all warps # execute stack initialization on all warps
la a1, vx_set_sp la a1, vx_set_sp
csrr a0, CSR_NW # get num warps csrr a0, CSR_NW # get num warps
.word 0x00b5106b # wspawn a0, a1 .insn s 0x6b, 1, a1, 0(a0) # wspawn a0, a1
jal vx_set_sp jal vx_set_sp
# return back to single thread execution # return back to single thread execution
li a0, 1 li a0, 1
.word 0x0005006b # tmc a0 .insn s 0x6b, 0, x0, 0(a0) # tmc a0
# Clear the bss segment # Clear the bss segment
la a0, _edata la a0, _edata
@ -44,15 +44,15 @@ _start:
_exit: _exit:
# disable all threads in current warp # disable all threads in current warp
li a0, 0 li a0, 0
.word 0x0005006b # tmc a0 .insn s 0x6b, 0, x0, 0(a0) # tmc a0
.section .text .section .text
.type vx_set_sp, @function .type vx_set_sp, @function
.global vx_set_sp .global vx_set_sp
vx_set_sp: vx_set_sp:
# activate all threads # activate all threads
csrr a0, CSR_NT # get num threads csrr a0, CSR_NT # get num threads
.word 0x0005006b # set thread mask .insn s 0x6b, 0, x0, 0(a0) # tmc a0
# set global pointer register # set global pointer register
.option push .option push
@ -76,7 +76,7 @@ vx_set_sp:
csrr a3, CSR_LWID # get local wid csrr a3, CSR_LWID # get local wid
beqz a3, RETURN beqz a3, RETURN
li a0, 0 li a0, 0
.word 0x0005006b # tmc a0 .insn s 0x6b, 0, x0, 0(a0) # tmc a0
RETURN: RETURN:
ret ret

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include -I../../../hw CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff