runtime instrinsics refactoring using RISC-V custom instruction assmebly directives

This commit is contained in:
Blaise Tine 2021-02-04 15:15:20 -05:00
parent a9f82bceae
commit b047f589d6
44 changed files with 90586 additions and 90486 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

Binary file not shown.

View file

@ -6,451 +6,377 @@ Disassembly of section .init:
80000000 <_start>:
80000000: 00000597 auipc a1,0x0
80000004: 0e458593 addi a1,a1,228 # 800000e4 <vx_set_sp>
80000004: 0bc58593 addi a1,a1,188 # 800000bc <vx_set_sp>
80000008: fc102573 csrr a0,0xfc1
8000000c: 00b5106b 0xb5106b
80000010: 0d4000ef jal ra,800000e4 <vx_set_sp>
80000010: 0ac000ef jal ra,800000bc <vx_set_sp>
80000014: 00100513 li a0,1
80000018: 0005006b 0x5006b
8000001c: 00002517 auipc a0,0x2
80000020: 99050513 addi a0,a0,-1648 # 800019ac <__BSS_END__>
80000020: 8e850513 addi a0,a0,-1816 # 80001904 <__BSS_END__>
80000024: 00002617 auipc a2,0x2
80000028: 98860613 addi a2,a2,-1656 # 800019ac <__BSS_END__>
80000028: 8e060613 addi a2,a2,-1824 # 80001904 <__BSS_END__>
8000002c: 40a60633 sub a2,a2,a0
80000030: 00000593 li a1,0
80000034: 2ac000ef jal ra,800002e0 <memset>
80000034: 204000ef jal ra,80000238 <memset>
80000038: 00000517 auipc a0,0x0
8000003c: 1b050513 addi a0,a0,432 # 800001e8 <__libc_fini_array>
80000040: 160000ef jal ra,800001a0 <atexit>
80000044: 200000ef jal ra,80000244 <__libc_init_array>
8000003c: 10850513 addi a0,a0,264 # 80000140 <__libc_fini_array>
80000040: 0b8000ef jal ra,800000f8 <atexit>
80000044: 158000ef jal ra,8000019c <__libc_init_array>
80000048: 008000ef jal ra,80000050 <main>
8000004c: 1680006f j 800001b4 <exit>
8000004c: 0c00006f j 8000010c <exit>
Disassembly of section .text:
80000050 <main>:
80000050: ff010113 addi sp,sp,-16
80000054: 7ffff7b7 lui a5,0x7ffff
80000058: 00812423 sw s0,8(sp)
8000005c: 0007a403 lw s0,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
80000060: 00912223 sw s1,4(sp)
80000064: 01212023 sw s2,0(sp)
80000068: 0087a483 lw s1,8(a5)
8000006c: 0047a903 lw s2,4(a5)
80000070: 00112623 sw ra,12(sp)
80000074: 0fc000ef jal ra,80000170 <vx_core_id>
80000078: 02850533 mul a0,a0,s0
8000007c: 02040863 beqz s0,800000ac <main+0x5c>
80000080: 00a40733 add a4,s0,a0
80000084: 00271713 slli a4,a4,0x2
80000088: 00251513 slli a0,a0,0x2
8000008c: 012507b3 add a5,a0,s2
80000090: 01270733 add a4,a4,s2
80000094: 412485b3 sub a1,s1,s2
80000098: 0007a603 lw a2,0(a5)
8000009c: 00f586b3 add a3,a1,a5
800000a0: 00478793 addi a5,a5,4
800000a4: 00c6a023 sw a2,0(a3)
800000a8: fef718e3 bne a4,a5,80000098 <main+0x48>
800000ac: 00c12083 lw ra,12(sp)
800000b0: 00812403 lw s0,8(sp)
800000b4: 00412483 lw s1,4(sp)
800000b8: 00012903 lw s2,0(sp)
800000bc: 01010113 addi sp,sp,16
800000c0: 00008067 ret
80000050: 7ffff7b7 lui a5,0x7ffff
80000054: 0007a703 lw a4,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
80000058: 0047a683 lw a3,4(a5)
8000005c: 0087a583 lw a1,8(a5)
80000060: cc5027f3 csrr a5,0xcc5
80000064: 02e787b3 mul a5,a5,a4
80000068: 02070863 beqz a4,80000098 <main+0x48>
8000006c: 00f70733 add a4,a4,a5
80000070: 00271713 slli a4,a4,0x2
80000074: 00279793 slli a5,a5,0x2
80000078: 00d787b3 add a5,a5,a3
8000007c: 00d70733 add a4,a4,a3
80000080: 40d585b3 sub a1,a1,a3
80000084: 0007a603 lw a2,0(a5)
80000088: 00f586b3 add a3,a1,a5
8000008c: 00478793 addi a5,a5,4
80000090: 00c6a023 sw a2,0(a3)
80000094: fef718e3 bne a4,a5,80000084 <main+0x34>
80000098: 00008067 ret
800000c4 <register_fini>:
800000c4: 00000793 li a5,0
800000c8: 00078863 beqz a5,800000d8 <register_fini+0x14>
800000cc: 80000537 lui a0,0x80000
800000d0: 1e850513 addi a0,a0,488 # 800001e8 <__stack_top+0x810001e8>
800000d4: 0cc0006f j 800001a0 <atexit>
800000d8: 00008067 ret
8000009c <register_fini>:
8000009c: 00000793 li a5,0
800000a0: 00078863 beqz a5,800000b0 <register_fini+0x14>
800000a4: 80000537 lui a0,0x80000
800000a8: 14050513 addi a0,a0,320 # 80000140 <__stack_top+0x81000140>
800000ac: 04c0006f j 800000f8 <atexit>
800000b0: 00008067 ret
800000dc <_exit>:
800000dc: 00000513 li a0,0
800000e0: 0005006b 0x5006b
800000b4 <_exit>:
800000b4: 00000513 li a0,0
800000b8: 0005006b 0x5006b
800000e4 <vx_set_sp>:
800000e4: fc002573 csrr a0,0xfc0
800000e8: 0005006b 0x5006b
800000ec: 00002197 auipc gp,0x2
800000f0: c9418193 addi gp,gp,-876 # 80001d80 <__global_pointer>
800000f4: 7f000117 auipc sp,0x7f000
800000f8: f0c10113 addi sp,sp,-244 # ff000000 <__stack_top>
800000fc: 40000593 li a1,1024
80000100: cc102673 csrr a2,0xcc1
80000104: 02c585b3 mul a1,a1,a2
80000108: 40b10133 sub sp,sp,a1
8000010c: cc3026f3 csrr a3,0xcc3
80000110: 00068663 beqz a3,8000011c <RETURN>
80000114: 00000513 li a0,0
80000118: 0005006b 0x5006b
800000bc <vx_set_sp>:
800000bc: fc002573 csrr a0,0xfc0
800000c0: 0005006b 0x5006b
800000c4: 00002197 auipc gp,0x2
800000c8: c1418193 addi gp,gp,-1004 # 80001cd8 <__global_pointer>
800000cc: 7f000117 auipc sp,0x7f000
800000d0: f3410113 addi sp,sp,-204 # ff000000 <__stack_top>
800000d4: 40000593 li a1,1024
800000d8: cc102673 csrr a2,0xcc1
800000dc: 02c585b3 mul a1,a1,a2
800000e0: 40b10133 sub sp,sp,a1
800000e4: cc3026f3 csrr a3,0xcc3
800000e8: 00068663 beqz a3,800000f4 <RETURN>
800000ec: 00000513 li a0,0
800000f0: 0005006b 0x5006b
8000011c <RETURN>:
8000011c: 00008067 ret
800000f4 <RETURN>:
800000f4: 00008067 ret
80000120 <vx_wspawn>:
80000120: 00b5106b 0xb5106b
80000124: 00008067 ret
800000f8 <atexit>:
800000f8: 00050593 mv a1,a0
800000fc: 00000693 li a3,0
80000100: 00000613 li a2,0
80000104: 00000513 li a0,0
80000108: 20c0006f j 80000314 <__register_exitproc>
80000128 <vx_tmc>:
80000128: 0005006b 0x5006b
8000012c: 00008067 ret
8000010c <exit>:
8000010c: ff010113 addi sp,sp,-16
80000110: 00000593 li a1,0
80000114: 00812423 sw s0,8(sp)
80000118: 00112623 sw ra,12(sp)
8000011c: 00050413 mv s0,a0
80000120: 290000ef jal ra,800003b0 <__call_exitprocs>
80000124: 800027b7 lui a5,0x80002
80000128: 9007a503 lw a0,-1792(a5) # 80001900 <__stack_top+0x81001900>
8000012c: 03c52783 lw a5,60(a0)
80000130: 00078463 beqz a5,80000138 <exit+0x2c>
80000134: 000780e7 jalr a5
80000138: 00040513 mv a0,s0
8000013c: f79ff0ef jal ra,800000b4 <_exit>
80000130 <vx_barrier>:
80000130: 00b5406b 0xb5406b
80000134: 00008067 ret
80000140 <__libc_fini_array>:
80000140: ff010113 addi sp,sp,-16
80000144: 00812423 sw s0,8(sp)
80000148: 800017b7 lui a5,0x80001
8000014c: 80001437 lui s0,0x80001
80000150: 4d840413 addi s0,s0,1240 # 800014d8 <__stack_top+0x810014d8>
80000154: 4d878793 addi a5,a5,1240 # 800014d8 <__stack_top+0x810014d8>
80000158: 408787b3 sub a5,a5,s0
8000015c: 00912223 sw s1,4(sp)
80000160: 00112623 sw ra,12(sp)
80000164: 4027d493 srai s1,a5,0x2
80000168: 02048063 beqz s1,80000188 <__libc_fini_array+0x48>
8000016c: ffc78793 addi a5,a5,-4
80000170: 00878433 add s0,a5,s0
80000174: 00042783 lw a5,0(s0)
80000178: fff48493 addi s1,s1,-1
8000017c: ffc40413 addi s0,s0,-4
80000180: 000780e7 jalr a5
80000184: fe0498e3 bnez s1,80000174 <__libc_fini_array+0x34>
80000188: 00c12083 lw ra,12(sp)
8000018c: 00812403 lw s0,8(sp)
80000190: 00412483 lw s1,4(sp)
80000194: 01010113 addi sp,sp,16
80000198: 00008067 ret
80000138 <vx_split>:
80000138: 0005206b 0x5206b
8000013c: 00008067 ret
8000019c <__libc_init_array>:
8000019c: ff010113 addi sp,sp,-16
800001a0: 00812423 sw s0,8(sp)
800001a4: 01212023 sw s2,0(sp)
800001a8: 80001437 lui s0,0x80001
800001ac: 80001937 lui s2,0x80001
800001b0: 4d440793 addi a5,s0,1236 # 800014d4 <__stack_top+0x810014d4>
800001b4: 4d490913 addi s2,s2,1236 # 800014d4 <__stack_top+0x810014d4>
800001b8: 40f90933 sub s2,s2,a5
800001bc: 00112623 sw ra,12(sp)
800001c0: 00912223 sw s1,4(sp)
800001c4: 40295913 srai s2,s2,0x2
800001c8: 02090063 beqz s2,800001e8 <__libc_init_array+0x4c>
800001cc: 4d440413 addi s0,s0,1236
800001d0: 00000493 li s1,0
800001d4: 00042783 lw a5,0(s0)
800001d8: 00148493 addi s1,s1,1
800001dc: 00440413 addi s0,s0,4
800001e0: 000780e7 jalr a5
800001e4: fe9918e3 bne s2,s1,800001d4 <__libc_init_array+0x38>
800001e8: 80001437 lui s0,0x80001
800001ec: 80001937 lui s2,0x80001
800001f0: 4d440793 addi a5,s0,1236 # 800014d4 <__stack_top+0x810014d4>
800001f4: 4d890913 addi s2,s2,1240 # 800014d8 <__stack_top+0x810014d8>
800001f8: 40f90933 sub s2,s2,a5
800001fc: 40295913 srai s2,s2,0x2
80000200: 02090063 beqz s2,80000220 <__libc_init_array+0x84>
80000204: 4d440413 addi s0,s0,1236
80000208: 00000493 li s1,0
8000020c: 00042783 lw a5,0(s0)
80000210: 00148493 addi s1,s1,1
80000214: 00440413 addi s0,s0,4
80000218: 000780e7 jalr a5
8000021c: fe9918e3 bne s2,s1,8000020c <__libc_init_array+0x70>
80000220: 00c12083 lw ra,12(sp)
80000224: 00812403 lw s0,8(sp)
80000228: 00412483 lw s1,4(sp)
8000022c: 00012903 lw s2,0(sp)
80000230: 01010113 addi sp,sp,16
80000234: 00008067 ret
80000140 <vx_join>:
80000140: 0000306b 0x306b
80000144: 00008067 ret
80000238 <memset>:
80000238: 00f00313 li t1,15
8000023c: 00050713 mv a4,a0
80000240: 02c37e63 bgeu t1,a2,8000027c <memset+0x44>
80000244: 00f77793 andi a5,a4,15
80000248: 0a079063 bnez a5,800002e8 <memset+0xb0>
8000024c: 08059263 bnez a1,800002d0 <memset+0x98>
80000250: ff067693 andi a3,a2,-16
80000254: 00f67613 andi a2,a2,15
80000258: 00e686b3 add a3,a3,a4
8000025c: 00b72023 sw a1,0(a4)
80000260: 00b72223 sw a1,4(a4)
80000264: 00b72423 sw a1,8(a4)
80000268: 00b72623 sw a1,12(a4)
8000026c: 01070713 addi a4,a4,16
80000270: fed766e3 bltu a4,a3,8000025c <memset+0x24>
80000274: 00061463 bnez a2,8000027c <memset+0x44>
80000278: 00008067 ret
8000027c: 40c306b3 sub a3,t1,a2
80000280: 00269693 slli a3,a3,0x2
80000284: 00000297 auipc t0,0x0
80000288: 005686b3 add a3,a3,t0
8000028c: 00c68067 jr 12(a3)
80000290: 00b70723 sb a1,14(a4)
80000294: 00b706a3 sb a1,13(a4)
80000298: 00b70623 sb a1,12(a4)
8000029c: 00b705a3 sb a1,11(a4)
800002a0: 00b70523 sb a1,10(a4)
800002a4: 00b704a3 sb a1,9(a4)
800002a8: 00b70423 sb a1,8(a4)
800002ac: 00b703a3 sb a1,7(a4)
800002b0: 00b70323 sb a1,6(a4)
800002b4: 00b702a3 sb a1,5(a4)
800002b8: 00b70223 sb a1,4(a4)
800002bc: 00b701a3 sb a1,3(a4)
800002c0: 00b70123 sb a1,2(a4)
800002c4: 00b700a3 sb a1,1(a4)
800002c8: 00b70023 sb a1,0(a4)
800002cc: 00008067 ret
800002d0: 0ff5f593 andi a1,a1,255
800002d4: 00859693 slli a3,a1,0x8
800002d8: 00d5e5b3 or a1,a1,a3
800002dc: 01059693 slli a3,a1,0x10
800002e0: 00d5e5b3 or a1,a1,a3
800002e4: f6dff06f j 80000250 <memset+0x18>
800002e8: 00279693 slli a3,a5,0x2
800002ec: 00000297 auipc t0,0x0
800002f0: 005686b3 add a3,a3,t0
800002f4: 00008293 mv t0,ra
800002f8: fa0680e7 jalr -96(a3)
800002fc: 00028093 mv ra,t0
80000300: ff078793 addi a5,a5,-16
80000304: 40f70733 sub a4,a4,a5
80000308: 00f60633 add a2,a2,a5
8000030c: f6c378e3 bgeu t1,a2,8000027c <memset+0x44>
80000310: f3dff06f j 8000024c <memset+0x14>
80000148 <vx_warp_id>:
80000148: cc302573 csrr a0,0xcc3
8000014c: 00008067 ret
80000150 <vx_warp_gid>:
80000150: f1402573 csrr a0,mhartid
80000154: 00008067 ret
80000158 <vx_thread_id>:
80000158: cc002573 csrr a0,0xcc0
8000015c: 00008067 ret
80000160 <vx_thread_lid>:
80000160: cc102573 csrr a0,0xcc1
80000164: 00008067 ret
80000168 <vx_thread_gid>:
80000168: cc202573 csrr a0,0xcc2
8000016c: 00008067 ret
80000170 <vx_core_id>:
80000170: cc502573 csrr a0,0xcc5
80000174: 00008067 ret
80000178 <vx_num_threads>:
80000178: fc002573 csrr a0,0xfc0
8000017c: 00008067 ret
80000180 <vx_num_warps>:
80000180: fc102573 csrr a0,0xfc1
80000184: 00008067 ret
80000188 <vx_num_cores>:
80000188: fc202573 csrr a0,0xfc2
8000018c: 00008067 ret
80000190 <vx_num_cycles>:
80000190: c0002573 rdcycle a0
80000194: 00008067 ret
80000198 <vx_num_instrs>:
80000198: c0202573 rdinstret a0
8000019c: 00008067 ret
800001a0 <atexit>:
800001a0: 00050593 mv a1,a0
800001a4: 00000693 li a3,0
800001a8: 00000613 li a2,0
800001ac: 00000513 li a0,0
800001b0: 20c0006f j 800003bc <__register_exitproc>
800001b4 <exit>:
800001b4: ff010113 addi sp,sp,-16
800001b8: 00000593 li a1,0
800001bc: 00812423 sw s0,8(sp)
800001c0: 00112623 sw ra,12(sp)
800001c4: 00050413 mv s0,a0
800001c8: 290000ef jal ra,80000458 <__call_exitprocs>
800001cc: 800027b7 lui a5,0x80002
800001d0: 9a87a503 lw a0,-1624(a5) # 800019a8 <__stack_top+0x810019a8>
800001d4: 03c52783 lw a5,60(a0)
800001d8: 00078463 beqz a5,800001e0 <exit+0x2c>
800001dc: 000780e7 jalr a5
800001e0: 00040513 mv a0,s0
800001e4: ef9ff0ef jal ra,800000dc <_exit>
800001e8 <__libc_fini_array>:
800001e8: ff010113 addi sp,sp,-16
800001ec: 00812423 sw s0,8(sp)
800001f0: 800017b7 lui a5,0x80001
800001f4: 80001437 lui s0,0x80001
800001f8: 58040413 addi s0,s0,1408 # 80001580 <__stack_top+0x81001580>
800001fc: 58078793 addi a5,a5,1408 # 80001580 <__stack_top+0x81001580>
80000200: 408787b3 sub a5,a5,s0
80000204: 00912223 sw s1,4(sp)
80000208: 00112623 sw ra,12(sp)
8000020c: 4027d493 srai s1,a5,0x2
80000210: 02048063 beqz s1,80000230 <__libc_fini_array+0x48>
80000214: ffc78793 addi a5,a5,-4
80000218: 00878433 add s0,a5,s0
8000021c: 00042783 lw a5,0(s0)
80000220: fff48493 addi s1,s1,-1
80000224: ffc40413 addi s0,s0,-4
80000228: 000780e7 jalr a5
8000022c: fe0498e3 bnez s1,8000021c <__libc_fini_array+0x34>
80000230: 00c12083 lw ra,12(sp)
80000234: 00812403 lw s0,8(sp)
80000238: 00412483 lw s1,4(sp)
8000023c: 01010113 addi sp,sp,16
80000240: 00008067 ret
80000244 <__libc_init_array>:
80000244: ff010113 addi sp,sp,-16
80000248: 00812423 sw s0,8(sp)
8000024c: 01212023 sw s2,0(sp)
80000250: 80001437 lui s0,0x80001
80000254: 80001937 lui s2,0x80001
80000258: 57c40793 addi a5,s0,1404 # 8000157c <__stack_top+0x8100157c>
8000025c: 57c90913 addi s2,s2,1404 # 8000157c <__stack_top+0x8100157c>
80000260: 40f90933 sub s2,s2,a5
80000264: 00112623 sw ra,12(sp)
80000268: 00912223 sw s1,4(sp)
8000026c: 40295913 srai s2,s2,0x2
80000270: 02090063 beqz s2,80000290 <__libc_init_array+0x4c>
80000274: 57c40413 addi s0,s0,1404
80000278: 00000493 li s1,0
8000027c: 00042783 lw a5,0(s0)
80000280: 00148493 addi s1,s1,1
80000284: 00440413 addi s0,s0,4
80000288: 000780e7 jalr a5
8000028c: fe9918e3 bne s2,s1,8000027c <__libc_init_array+0x38>
80000290: 80001437 lui s0,0x80001
80000294: 80001937 lui s2,0x80001
80000298: 57c40793 addi a5,s0,1404 # 8000157c <__stack_top+0x8100157c>
8000029c: 58090913 addi s2,s2,1408 # 80001580 <__stack_top+0x81001580>
800002a0: 40f90933 sub s2,s2,a5
800002a4: 40295913 srai s2,s2,0x2
800002a8: 02090063 beqz s2,800002c8 <__libc_init_array+0x84>
800002ac: 57c40413 addi s0,s0,1404
800002b0: 00000493 li s1,0
800002b4: 00042783 lw a5,0(s0)
800002b8: 00148493 addi s1,s1,1
800002bc: 00440413 addi s0,s0,4
800002c0: 000780e7 jalr a5
800002c4: fe9918e3 bne s2,s1,800002b4 <__libc_init_array+0x70>
800002c8: 00c12083 lw ra,12(sp)
800002cc: 00812403 lw s0,8(sp)
800002d0: 00412483 lw s1,4(sp)
800002d4: 00012903 lw s2,0(sp)
800002d8: 01010113 addi sp,sp,16
800002dc: 00008067 ret
800002e0 <memset>:
800002e0: 00f00313 li t1,15
800002e4: 00050713 mv a4,a0
800002e8: 02c37e63 bgeu t1,a2,80000324 <memset+0x44>
800002ec: 00f77793 andi a5,a4,15
800002f0: 0a079063 bnez a5,80000390 <memset+0xb0>
800002f4: 08059263 bnez a1,80000378 <memset+0x98>
800002f8: ff067693 andi a3,a2,-16
800002fc: 00f67613 andi a2,a2,15
80000300: 00e686b3 add a3,a3,a4
80000304: 00b72023 sw a1,0(a4)
80000308: 00b72223 sw a1,4(a4)
8000030c: 00b72423 sw a1,8(a4)
80000310: 00b72623 sw a1,12(a4)
80000314: 01070713 addi a4,a4,16
80000318: fed766e3 bltu a4,a3,80000304 <memset+0x24>
8000031c: 00061463 bnez a2,80000324 <memset+0x44>
80000320: 00008067 ret
80000324: 40c306b3 sub a3,t1,a2
80000328: 00269693 slli a3,a3,0x2
8000032c: 00000297 auipc t0,0x0
80000330: 005686b3 add a3,a3,t0
80000334: 00c68067 jr 12(a3)
80000338: 00b70723 sb a1,14(a4)
8000033c: 00b706a3 sb a1,13(a4)
80000340: 00b70623 sb a1,12(a4)
80000344: 00b705a3 sb a1,11(a4)
80000348: 00b70523 sb a1,10(a4)
8000034c: 00b704a3 sb a1,9(a4)
80000350: 00b70423 sb a1,8(a4)
80000354: 00b703a3 sb a1,7(a4)
80000358: 00b70323 sb a1,6(a4)
8000035c: 00b702a3 sb a1,5(a4)
80000360: 00b70223 sb a1,4(a4)
80000364: 00b701a3 sb a1,3(a4)
80000368: 00b70123 sb a1,2(a4)
8000036c: 00b700a3 sb a1,1(a4)
80000370: 00b70023 sb a1,0(a4)
80000314 <__register_exitproc>:
80000314: 800027b7 lui a5,0x80002
80000318: 9007a703 lw a4,-1792(a5) # 80001900 <__stack_top+0x81001900>
8000031c: 14872783 lw a5,328(a4)
80000320: 04078c63 beqz a5,80000378 <__register_exitproc+0x64>
80000324: 0047a703 lw a4,4(a5)
80000328: 01f00813 li a6,31
8000032c: 06e84e63 blt a6,a4,800003a8 <__register_exitproc+0x94>
80000330: 00271813 slli a6,a4,0x2
80000334: 02050663 beqz a0,80000360 <__register_exitproc+0x4c>
80000338: 01078333 add t1,a5,a6
8000033c: 08c32423 sw a2,136(t1)
80000340: 1887a883 lw a7,392(a5)
80000344: 00100613 li a2,1
80000348: 00e61633 sll a2,a2,a4
8000034c: 00c8e8b3 or a7,a7,a2
80000350: 1917a423 sw a7,392(a5)
80000354: 10d32423 sw a3,264(t1)
80000358: 00200693 li a3,2
8000035c: 02d50463 beq a0,a3,80000384 <__register_exitproc+0x70>
80000360: 00170713 addi a4,a4,1
80000364: 00e7a223 sw a4,4(a5)
80000368: 010787b3 add a5,a5,a6
8000036c: 00b7a423 sw a1,8(a5)
80000370: 00000513 li a0,0
80000374: 00008067 ret
80000378: 0ff5f593 andi a1,a1,255
8000037c: 00859693 slli a3,a1,0x8
80000380: 00d5e5b3 or a1,a1,a3
80000384: 01059693 slli a3,a1,0x10
80000388: 00d5e5b3 or a1,a1,a3
8000038c: f6dff06f j 800002f8 <memset+0x18>
80000390: 00279693 slli a3,a5,0x2
80000394: 00000297 auipc t0,0x0
80000398: 005686b3 add a3,a3,t0
8000039c: 00008293 mv t0,ra
800003a0: fa0680e7 jalr -96(a3)
800003a4: 00028093 mv ra,t0
800003a8: ff078793 addi a5,a5,-16
800003ac: 40f70733 sub a4,a4,a5
800003b0: 00f60633 add a2,a2,a5
800003b4: f6c378e3 bgeu t1,a2,80000324 <memset+0x44>
800003b8: f3dff06f j 800002f4 <memset+0x14>
80000378: 14c70793 addi a5,a4,332
8000037c: 14f72423 sw a5,328(a4)
80000380: fa5ff06f j 80000324 <__register_exitproc+0x10>
80000384: 18c7a683 lw a3,396(a5)
80000388: 00170713 addi a4,a4,1
8000038c: 00e7a223 sw a4,4(a5)
80000390: 00c6e633 or a2,a3,a2
80000394: 18c7a623 sw a2,396(a5)
80000398: 010787b3 add a5,a5,a6
8000039c: 00b7a423 sw a1,8(a5)
800003a0: 00000513 li a0,0
800003a4: 00008067 ret
800003a8: fff00513 li a0,-1
800003ac: 00008067 ret
800003bc <__register_exitproc>:
800003bc: 800027b7 lui a5,0x80002
800003c0: 9a87a703 lw a4,-1624(a5) # 800019a8 <__stack_top+0x810019a8>
800003c4: 14872783 lw a5,328(a4)
800003c8: 04078c63 beqz a5,80000420 <__register_exitproc+0x64>
800003cc: 0047a703 lw a4,4(a5)
800003d0: 01f00813 li a6,31
800003d4: 06e84e63 blt a6,a4,80000450 <__register_exitproc+0x94>
800003d8: 00271813 slli a6,a4,0x2
800003dc: 02050663 beqz a0,80000408 <__register_exitproc+0x4c>
800003e0: 01078333 add t1,a5,a6
800003e4: 08c32423 sw a2,136(t1)
800003e8: 1887a883 lw a7,392(a5)
800003ec: 00100613 li a2,1
800003f0: 00e61633 sll a2,a2,a4
800003f4: 00c8e8b3 or a7,a7,a2
800003f8: 1917a423 sw a7,392(a5)
800003fc: 10d32423 sw a3,264(t1)
80000400: 00200693 li a3,2
80000404: 02d50463 beq a0,a3,8000042c <__register_exitproc+0x70>
80000408: 00170713 addi a4,a4,1
8000040c: 00e7a223 sw a4,4(a5)
80000410: 010787b3 add a5,a5,a6
80000414: 00b7a423 sw a1,8(a5)
80000418: 00000513 li a0,0
8000041c: 00008067 ret
80000420: 14c70793 addi a5,a4,332
80000424: 14f72423 sw a5,328(a4)
80000428: fa5ff06f j 800003cc <__register_exitproc+0x10>
8000042c: 18c7a683 lw a3,396(a5)
80000430: 00170713 addi a4,a4,1
80000434: 00e7a223 sw a4,4(a5)
80000438: 00c6e633 or a2,a3,a2
8000043c: 18c7a623 sw a2,396(a5)
80000440: 010787b3 add a5,a5,a6
80000444: 00b7a423 sw a1,8(a5)
80000448: 00000513 li a0,0
8000044c: 00008067 ret
80000450: fff00513 li a0,-1
800003b0 <__call_exitprocs>:
800003b0: fd010113 addi sp,sp,-48
800003b4: 800027b7 lui a5,0x80002
800003b8: 01412c23 sw s4,24(sp)
800003bc: 9007aa03 lw s4,-1792(a5) # 80001900 <__stack_top+0x81001900>
800003c0: 03212023 sw s2,32(sp)
800003c4: 02112623 sw ra,44(sp)
800003c8: 148a2903 lw s2,328(s4)
800003cc: 02812423 sw s0,40(sp)
800003d0: 02912223 sw s1,36(sp)
800003d4: 01312e23 sw s3,28(sp)
800003d8: 01512a23 sw s5,20(sp)
800003dc: 01612823 sw s6,16(sp)
800003e0: 01712623 sw s7,12(sp)
800003e4: 01812423 sw s8,8(sp)
800003e8: 04090063 beqz s2,80000428 <__call_exitprocs+0x78>
800003ec: 00050b13 mv s6,a0
800003f0: 00058b93 mv s7,a1
800003f4: 00100a93 li s5,1
800003f8: fff00993 li s3,-1
800003fc: 00492483 lw s1,4(s2)
80000400: fff48413 addi s0,s1,-1
80000404: 02044263 bltz s0,80000428 <__call_exitprocs+0x78>
80000408: 00249493 slli s1,s1,0x2
8000040c: 009904b3 add s1,s2,s1
80000410: 040b8463 beqz s7,80000458 <__call_exitprocs+0xa8>
80000414: 1044a783 lw a5,260(s1)
80000418: 05778063 beq a5,s7,80000458 <__call_exitprocs+0xa8>
8000041c: fff40413 addi s0,s0,-1
80000420: ffc48493 addi s1,s1,-4
80000424: ff3416e3 bne s0,s3,80000410 <__call_exitprocs+0x60>
80000428: 02c12083 lw ra,44(sp)
8000042c: 02812403 lw s0,40(sp)
80000430: 02412483 lw s1,36(sp)
80000434: 02012903 lw s2,32(sp)
80000438: 01c12983 lw s3,28(sp)
8000043c: 01812a03 lw s4,24(sp)
80000440: 01412a83 lw s5,20(sp)
80000444: 01012b03 lw s6,16(sp)
80000448: 00c12b83 lw s7,12(sp)
8000044c: 00812c03 lw s8,8(sp)
80000450: 03010113 addi sp,sp,48
80000454: 00008067 ret
80000458 <__call_exitprocs>:
80000458: fd010113 addi sp,sp,-48
8000045c: 800027b7 lui a5,0x80002
80000460: 01412c23 sw s4,24(sp)
80000464: 9a87aa03 lw s4,-1624(a5) # 800019a8 <__stack_top+0x810019a8>
80000468: 03212023 sw s2,32(sp)
8000046c: 02112623 sw ra,44(sp)
80000470: 148a2903 lw s2,328(s4)
80000474: 02812423 sw s0,40(sp)
80000478: 02912223 sw s1,36(sp)
8000047c: 01312e23 sw s3,28(sp)
80000480: 01512a23 sw s5,20(sp)
80000484: 01612823 sw s6,16(sp)
80000488: 01712623 sw s7,12(sp)
8000048c: 01812423 sw s8,8(sp)
80000490: 04090063 beqz s2,800004d0 <__call_exitprocs+0x78>
80000494: 00050b13 mv s6,a0
80000498: 00058b93 mv s7,a1
8000049c: 00100a93 li s5,1
800004a0: fff00993 li s3,-1
800004a4: 00492483 lw s1,4(s2)
800004a8: fff48413 addi s0,s1,-1
800004ac: 02044263 bltz s0,800004d0 <__call_exitprocs+0x78>
800004b0: 00249493 slli s1,s1,0x2
800004b4: 009904b3 add s1,s2,s1
800004b8: 040b8463 beqz s7,80000500 <__call_exitprocs+0xa8>
800004bc: 1044a783 lw a5,260(s1)
800004c0: 05778063 beq a5,s7,80000500 <__call_exitprocs+0xa8>
800004c4: fff40413 addi s0,s0,-1
800004c8: ffc48493 addi s1,s1,-4
800004cc: ff3416e3 bne s0,s3,800004b8 <__call_exitprocs+0x60>
800004d0: 02c12083 lw ra,44(sp)
800004d4: 02812403 lw s0,40(sp)
800004d8: 02412483 lw s1,36(sp)
800004dc: 02012903 lw s2,32(sp)
800004e0: 01c12983 lw s3,28(sp)
800004e4: 01812a03 lw s4,24(sp)
800004e8: 01412a83 lw s5,20(sp)
800004ec: 01012b03 lw s6,16(sp)
800004f0: 00c12b83 lw s7,12(sp)
800004f4: 00812c03 lw s8,8(sp)
800004f8: 03010113 addi sp,sp,48
800004fc: 00008067 ret
80000500: 00492783 lw a5,4(s2)
80000504: 0044a683 lw a3,4(s1)
80000508: fff78793 addi a5,a5,-1
8000050c: 04878e63 beq a5,s0,80000568 <__call_exitprocs+0x110>
80000510: 0004a223 sw zero,4(s1)
80000514: fa0688e3 beqz a3,800004c4 <__call_exitprocs+0x6c>
80000518: 18892783 lw a5,392(s2)
8000051c: 008a9733 sll a4,s5,s0
80000520: 00492c03 lw s8,4(s2)
80000524: 00f777b3 and a5,a4,a5
80000528: 02079263 bnez a5,8000054c <__call_exitprocs+0xf4>
8000052c: 000680e7 jalr a3
80000530: 00492703 lw a4,4(s2)
80000534: 148a2783 lw a5,328(s4)
80000538: 01871463 bne a4,s8,80000540 <__call_exitprocs+0xe8>
8000053c: f8f904e3 beq s2,a5,800004c4 <__call_exitprocs+0x6c>
80000540: f80788e3 beqz a5,800004d0 <__call_exitprocs+0x78>
80000544: 00078913 mv s2,a5
80000548: f5dff06f j 800004a4 <__call_exitprocs+0x4c>
8000054c: 18c92783 lw a5,396(s2)
80000550: 0844a583 lw a1,132(s1)
80000554: 00f77733 and a4,a4,a5
80000558: 00071c63 bnez a4,80000570 <__call_exitprocs+0x118>
8000055c: 000b0513 mv a0,s6
80000560: 000680e7 jalr a3
80000564: fcdff06f j 80000530 <__call_exitprocs+0xd8>
80000568: 00892223 sw s0,4(s2)
8000056c: fa9ff06f j 80000514 <__call_exitprocs+0xbc>
80000570: 00058513 mv a0,a1
80000574: 000680e7 jalr a3
80000578: fb9ff06f j 80000530 <__call_exitprocs+0xd8>
80000458: 00492783 lw a5,4(s2)
8000045c: 0044a683 lw a3,4(s1)
80000460: fff78793 addi a5,a5,-1
80000464: 04878e63 beq a5,s0,800004c0 <__call_exitprocs+0x110>
80000468: 0004a223 sw zero,4(s1)
8000046c: fa0688e3 beqz a3,8000041c <__call_exitprocs+0x6c>
80000470: 18892783 lw a5,392(s2)
80000474: 008a9733 sll a4,s5,s0
80000478: 00492c03 lw s8,4(s2)
8000047c: 00f777b3 and a5,a4,a5
80000480: 02079263 bnez a5,800004a4 <__call_exitprocs+0xf4>
80000484: 000680e7 jalr a3
80000488: 00492703 lw a4,4(s2)
8000048c: 148a2783 lw a5,328(s4)
80000490: 01871463 bne a4,s8,80000498 <__call_exitprocs+0xe8>
80000494: f8f904e3 beq s2,a5,8000041c <__call_exitprocs+0x6c>
80000498: f80788e3 beqz a5,80000428 <__call_exitprocs+0x78>
8000049c: 00078913 mv s2,a5
800004a0: f5dff06f j 800003fc <__call_exitprocs+0x4c>
800004a4: 18c92783 lw a5,396(s2)
800004a8: 0844a583 lw a1,132(s1)
800004ac: 00f77733 and a4,a4,a5
800004b0: 00071c63 bnez a4,800004c8 <__call_exitprocs+0x118>
800004b4: 000b0513 mv a0,s6
800004b8: 000680e7 jalr a3
800004bc: fcdff06f j 80000488 <__call_exitprocs+0xd8>
800004c0: 00892223 sw s0,4(s2)
800004c4: fa9ff06f j 8000046c <__call_exitprocs+0xbc>
800004c8: 00058513 mv a0,a1
800004cc: 000680e7 jalr a3
800004d0: fb9ff06f j 80000488 <__call_exitprocs+0xd8>
Disassembly of section .init_array:
8000157c <__init_array_start>:
8000157c: 00c4 addi s1,sp,68
8000157e: 8000 0x8000
800014d4 <__init_array_start>:
800014d4: 009c addi a5,sp,64
800014d6: 8000 0x8000
Disassembly of section .data:
80001580 <impure_data>:
80001580: 0000 unimp
80001582: 0000 unimp
80001584: 186c addi a1,sp,60
80001586: 8000 0x8000
80001588: 18d4 addi a3,sp,116
8000158a: 8000 0x8000
8000158c: 193c addi a5,sp,184
8000158e: 8000 0x8000
800014d8 <impure_data>:
800014d8: 0000 unimp
800014da: 0000 unimp
800014dc: 17c4 addi s1,sp,996
800014de: 8000 0x8000
800014e0: 182c addi a1,sp,56
800014e2: 8000 0x8000
800014e4: 1894 addi a3,sp,112
800014e6: 8000 0x8000
...
80001628: 0001 nop
8000162a: 0000 unimp
8000162c: 0000 unimp
8000162e: 0000 unimp
80001630: 330e fld ft6,224(sp)
80001632: abcd j 80001c24 <__BSS_END__+0x278>
80001634: 1234 addi a3,sp,296
80001636: e66d bnez a2,80001720 <impure_data+0x1a0>
80001638: deec sw a1,124(a3)
8000163a: 0005 c.nop 1
8000163c: 0000000b 0xb
80001580: 0001 nop
80001582: 0000 unimp
80001584: 0000 unimp
80001586: 0000 unimp
80001588: 330e fld ft6,224(sp)
8000158a: abcd j 80001b7c <__BSS_END__+0x278>
8000158c: 1234 addi a3,sp,296
8000158e: e66d bnez a2,80001678 <impure_data+0x1a0>
80001590: deec sw a1,124(a3)
80001592: 0005 c.nop 1
80001594: 0000000b 0xb
...
Disassembly of section .sdata:
800019a8 <_global_impure_ptr>:
800019a8: 1580 addi s0,sp,736
800019aa: 8000 0x8000
80001900 <_global_impure_ptr>:
80001900: 14d8 addi a4,sp,612
80001902: 8000 0x8000
Disassembly of section .comment:

Binary file not shown.

View file

@ -9,7 +9,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -9,7 +9,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -nostartfiles -Wl,--gc-sections,-Map,kernel.map
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a
VX_LDFLAGS += -lm

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -13,7 +13,7 @@ LDFLAGS +=
PROJECT = libvortexrt
SRCS = ./src/vx_start.S ./src/vx_intrinsics.S ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c
SRCS = ./src/vx_start.S ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c
OBJS := $(addsuffix .o, $(notdir $(SRCS)))

View file

@ -1,57 +1,113 @@
#ifndef VX_INTRINSICS_H
#define VX_INTRINSICS_H
#include <VX_config.h>
#ifdef __cplusplus
extern "C" {
#endif
// Spawn warps
void vx_wspawn(int num_warps, unsigned func_ptr);
// Set thread mask
void vx_tmc(int num_threads);
inline void vx_tmc(unsigned num_threads) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(num_threads));
}
// Warp Barrier
void vx_barrier(int barried_id, int num_warps);
// Spawn warps
inline void vx_wspawn(unsigned num_warps, void* func_ptr) {
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
}
// Split on a predicate
void vx_split(int predicate);
inline void vx_split(int predicate) {
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
}
// Join
void vx_join();
inline void vx_join() {
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
}
// Warp Barrier
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
asm volatile (".insn s 0x6b, 4, %1, 0cd (%0)" :: "r"(barried_id), "r"(num_warps));
}
// Return active warp's thread id
int vx_thread_id();
inline int vx_thread_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
return result;
}
// Return active core's local thread id
int vx_thread_lid();
inline int vx_thread_lid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
return result;
}
// Return processsor global thread id
int vx_thread_gid();
inline int vx_thread_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
return result;
}
// Return active core's local warp id
int vx_warp_id();
inline int vx_warp_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
return result;
}
// Return processsor's global warp id
int vx_warp_gid();
inline int vx_warp_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
return result;
}
// Return processsor core id
int vx_core_id();
inline int vx_core_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
return result;
}
// Return the number of threads in a warp
int vx_num_threads();
inline int vx_num_threads() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
return result;
}
// Return the number of warps in a core
int vx_num_warps();
inline int vx_num_warps() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
return result;
}
// Return the number of cores in the processsor
int vx_num_cores();
inline int vx_num_cores() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
return result;
}
// Return the number of cycles
int vx_num_cycles();
inline int vx_num_cycles() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_CYCLE));
return result;
}
// Return the number of instructions
int vx_num_instrs();
inline int vx_num_instrs() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_INSTRET));
return result;
}
#define __if(b) vx_split(b); \
if (b)

View file

@ -8,9 +8,29 @@
extern "C" {
#endif
typedef void (*pfn_callback)(int task_id, void *arg);
struct context_t {
uint32_t num_groups[3];
uint32_t global_offset[3];
uint32_t local_size[3];
char * printf_buffer;
uint32_t *printf_buffer_position;
uint32_t printf_buffer_capacity;
uint32_t work_dim;
};
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args);
typedef void (*pfn_workgroup_func) (
const void * /* args */,
const struct context_t * /* context */,
uint32_t /* group_x */,
uint32_t /* group_y */,
uint32_t /* group_z */
);
typedef void (*pfn_callback)(int task_id, const void *arg);
void vx_spawn_kernel(struct context_t * ctx, pfn_workgroup_func wg_func, const void * args);
void vx_spawn_tasks(int num_tasks, pfn_callback callback, const void * args);
#ifdef __cplusplus
}

View file

@ -1,99 +0,0 @@
#include <VX_config.h>
.section .text
.type vx_wspawn, @function
.global vx_wspawn
vx_wspawn:
.word 0x00b5106b # wspawn a0(num_warps), a1(func_ptr)
ret
.type vx_tmc, @function
.global vx_tmc
vx_tmc:
.word 0x0005006b # tmc a0
ret
.type vx_barrier, @function
.global vx_barrier
vx_barrier:
.word 0x00b5406b # barrier a0(barrier_id), a1(num_warps)
ret
.type vx_split, @function
.global vx_split
vx_split:
.word 0x0005206b # split a0
ret
.type vx_join, @function
.global vx_join
vx_join:
.word 0x0000306b #join
ret
.type vx_warp_id, @function
.global vx_warp_id
vx_warp_id:
csrr a0, CSR_LWID
ret
.type vx_warp_gid, @function
.global vx_warp_gid
vx_warp_gid:
csrr a0, CSR_GWID
ret
.type vx_thread_id, @function
.global vx_thread_id
vx_thread_id:
csrr a0, CSR_WTID
ret
.type vx_thread_lid, @function
.global vx_thread_lid
vx_thread_lid:
csrr a0, CSR_LTID
ret
.type vx_thread_gid, @function
.global vx_thread_gid
vx_thread_gid:
csrr a0, CSR_GTID
ret
.type vx_core_id, @function
.global vx_core_id
vx_core_id:
csrr a0, CSR_GCID
ret
.type vx_num_threads, @function
.global vx_num_threads
vx_num_threads:
csrr a0, CSR_NT
ret
.type vx_num_warps, @function
.global vx_num_warps
vx_num_warps:
csrr a0, CSR_NW
ret
.type vx_num_cores, @function
.global vx_num_cores
vx_num_cores:
csrr a0, CSR_NC
ret
.type vx_num_cycles, @function
.global vx_num_cycles
vx_num_cycles:
csrr a0, CSR_CYCLE
ret
.type vx_num_instrs, @function
.global vx_num_instrs
vx_num_instrs:
csrr a0, CSR_INSTRET
ret

View file

@ -12,13 +12,34 @@ extern "C" {
typedef struct {
pfn_callback callback;
void * args;
const void * args;
int offset;
int N;
int R;
} wspawn_args_t;
} wspawn_tasks_args_t;
wspawn_args_t* g_wspawn_args[NUM_CORES_MAX];
typedef struct {
struct context_t * ctx;
pfn_workgroup_func wg_func;
const void * args;
int offset;
int N;
int R;
char isXYpow2;
char isXpow2;
char log2XY;
char log2X;
} wspawn_kernel_args_t;
void* g_wspawn_args[NUM_CORES_MAX];
inline char is_log2(int x) {
return ((x & (x-1)) == 0);
}
inline int fast_log2(int x) {
return (*(int*)(&x)>>23) - 127;
}
void spawn_tasks_callback() {
vx_tmc(vx_num_threads());
@ -28,7 +49,7 @@ void spawn_tasks_callback() {
int tid = vx_thread_id();
int NT = vx_num_threads();
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id];
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
@ -47,7 +68,7 @@ void spawn_remaining_tasks_callback(int nthreads) {
int core_id = vx_core_id();
int tid = vx_thread_gid();
wspawn_args_t* p_wspawn_args = g_wspawn_args[core_id];
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
int task_id = p_wspawn_args->offset + tid;
(p_wspawn_args->callback)(task_id, p_wspawn_args->args);
@ -55,7 +76,7 @@ void spawn_remaining_tasks_callback(int nthreads) {
vx_tmc(1);
}
void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
void vx_spawn_tasks(int num_tasks, pfn_callback callback , const void * args) {
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
@ -90,7 +111,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
fW = 1;
//--
wspawn_args_t wspawn_args = { callback, args, core_id * tasks_per_core, fW, rW };
wspawn_tasks_args_t wspawn_args = { callback, args, core_id * tasks_per_core, fW, rW };
g_wspawn_args[core_id] = &wspawn_args;
//--
@ -107,6 +128,135 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
}
}
///////////////////////////////////////////////////////////////////////////////
void spawn_kernel_callback() {
vx_tmc(vx_num_threads());
int core_id = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
int NT = vx_num_threads();
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
int X = p_wspawn_args->ctx->num_groups[0];
int Y = p_wspawn_args->ctx->num_groups[1];
int XY = X * Y;
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
int k = p_wspawn_args->isXYpow2 ? (wg_id / XY) : (wg_id >> p_wspawn_args->log2XY);
int wg_2d = wg_id - k * XY;
int j = p_wspawn_args->isXpow2 ? (wg_2d / X) : (wg_2d >> p_wspawn_args->log2X);
int i = wg_2d - j * X;
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
(p_wspawn_args->wg_func)(p_wspawn_args->args, p_wspawn_args->ctx, gid0, gid1, gid2);
}
vx_tmc(0 == wid);
}
void spawn_kernel_remaining_callback(int nthreads) {
vx_tmc(nthreads);
int core_id = vx_core_id();
int tid = vx_thread_gid();
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
int wg_id = p_wspawn_args->offset + tid;
int X = p_wspawn_args->ctx->num_groups[0];
int Y = p_wspawn_args->ctx->num_groups[1];
int XY = X * Y;
int k = p_wspawn_args->isXYpow2 ? (wg_id / XY) : (wg_id >> p_wspawn_args->log2XY);
int wg_2d = wg_id - k * XY;
int j = p_wspawn_args->isXpow2 ? (wg_2d / X) : (wg_2d >> p_wspawn_args->log2X);
int i = wg_2d - j * X;
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
(p_wspawn_args->wg_func)(p_wspawn_args->args, p_wspawn_args->ctx, gid0, gid1, gid2);
vx_tmc(1);
}
void vx_spawn_kernel(struct context_t * ctx, pfn_workgroup_func wg_func, const void * args) {
// total number of WGs
int X = ctx->num_groups[0];
int Y = ctx->num_groups[1];
int Z = ctx->num_groups[2];
int XY = X * Y;
int Q = XY * Z;
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT;
int nC = (Q > WT) ? (Q / WT) : 1;
int nc = MIN(nC, NC);
if (core_id >= nc)
return; // terminate extra cores
// number of workgroups per core
int wgs_per_core = Q / nc;
int wgs_per_core0 = wgs_per_core;
if (core_id == (NC-1)) {
int QC_r = Q - (nc * wgs_per_core0);
wgs_per_core0 += QC_r; // last core executes remaining WGs
}
// number of workgroups per warp
int nW = wgs_per_core0 / NT; // total warps per core
int rT = wgs_per_core0 - (nW * NT); // remaining threads
int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations
int rW = (fW != 0) ? (nW - fW * NW) : 0; // reamining full warps
if (0 == fW)
fW = 1;
// fast path handling
char isXYpow2 = is_log2(XY);
char isXpow2 = is_log2(X);
char log2XY = fast_log2(XY);
char log2X = fast_log2(X);
//--
wspawn_kernel_args_t wspawn_args = { ctx, wg_func, args, core_id * wgs_per_core, fW, rW, isXYpow2, isXpow2, log2XY, log2X };
g_wspawn_args[core_id] = &wspawn_args;
//--
if (nW >= 1) {
int nw = MIN(nW, NW);
vx_wspawn(nw, (unsigned)&spawn_kernel_callback);
spawn_kernel_callback();
}
//--
if (rT != 0) {
wspawn_args.offset = wgs_per_core0 - rT;
spawn_kernel_remaining_callback(rT);
}
}
#ifdef __cplusplus
}
#endif

View file

@ -8,12 +8,12 @@ _start:
# execute stack initialization on all warps
la a1, vx_set_sp
csrr a0, CSR_NW # get num warps
.word 0x00b5106b # wspawn a0, a1
.insn s 0x6b, 1, a1, 0(a0) # wspawn a0, a1
jal vx_set_sp
# return back to single thread execution
li a0, 1
.word 0x0005006b # tmc a0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
# Clear the bss segment
la a0, _edata
@ -44,15 +44,15 @@ _start:
_exit:
# disable all threads in current warp
li a0, 0
.word 0x0005006b # tmc a0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
.section .text
.type vx_set_sp, @function
.global vx_set_sp
vx_set_sp:
# activate all threads
csrr a0, CSR_NT # get num threads
.word 0x0005006b # set thread mask
csrr a0, CSR_NT # get num threads
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
# set global pointer register
.option push
@ -76,7 +76,7 @@ vx_set_sp:
csrr a3, CSR_LWID # get local wid
beqz a3, RETURN
li a0, 0
.word 0x0005006b # tmc a0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
RETURN:
ret

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include
CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include
CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include
CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld
CFLAGS += -nostartfiles -ffreestanding -fno-exceptions -Wl,--gc-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include -I../../../hw
CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff