ETH Price: $4,489.24 (-0.29%)
Gas: 0.25 Gwei

Input Data Messages (IDM)

Decentralized communication on Ethereum.

Filter by:
31,985 IDM
Age:30D

||FILE:asm/visual/ascii_renderer.s||
//==============================================================================
// ASCII Renderer - ARM64 Assembly Implementation
//
// Port of ascii_renderer.c to pure ARM64 assembly
// Functions for bitmap font rendering
//==============================================================================

.text
.align 4

//==============================================================================
// Constants
//==============================================================================
// Glyph cell and framebuffer dimensions as 32-bit words.
// NOTE(review): these words are emitted into .text and the drawing routine in
// this file does not read them (it uses its own 800/600 values) - confirm
// whether any other translation unit references these labels.
ascii_constants:
char_width: .word 8 // glyph width in pixels
char_height: .word 8 // glyph height in pixels
vis_width: .word 800 // framebuffer width in pixels
vis_height: .word 600 // framebuffer height in pixels

//==============================================================================
// ASCII Font Data - 8x8 bitmap font
// Each character uses 2 uint32 values (64 bits total)
// 256 characters arranged in 16x16 grid
//
// Layout as consumed by _draw_ascii_char_asm:
//   - entry for character c starts at ascii_font + c * 8
//   - word 0 holds rows 0-3, word 1 holds rows 4-7; within a word the
//     lowest byte is the first (top-most) of its four rows
//   - within a row byte, bit 7 is the left-most column and bit 0 the right-most
//
// NOTE(review): in the table below every digit '0'-'9' carries the same bitmap
// and every letter 'A'-'Z' carries the same bitmap, and lowercase letters are
// blank - these look like placeholders rather than real glyphs.
//==============================================================================
.align 4
ascii_font:
// Character row 0 (chars 0-15)
.word 0x00000000, 0x00000000 // char 0 0x00
.word 0x00000000, 0x00000000 // char 1 0x01
.word 0x00000000, 0x00000000 // char 2 0x02
.word 0x00000000, 0x00000000 // char 3 0x03
.word 0x00000000, 0x00000000 // char 4 0x04
.word 0x00000000, 0x00000000 // char 5 0x05
.word 0x00000000, 0x00000000 // char 6 0x06
.word 0x00000000, 0x00000000 // char 7 0x07
.word 0x00000000, 0x00000000 // char 8 0x08
.word 0x00000000, 0x00000000 // char 9 0x09
.word 0x00000000, 0x00000000 // char 10 0x0a
.word 0x00000000, 0x00000000 // char 11 0x0b
.word 0x00000000, 0x00000000 // char 12 0x0c
.word 0x00000000, 0x00000000 // char 13 0x0d
.word 0x00000000, 0x00000000 // char 14 0x0e
.word 0x00000000, 0x00000000 // char 15 0x0f

// Character row 1 (chars 16-31)
.word 0x00000000, 0x00000000 // char 16 0x10
.word 0x00000000, 0x00000000 // char 17 0x11
.word 0x00000000, 0x00000000 // char 18 0x12
.word 0x00000000, 0x00000000 // char 19 0x13
.word 0x00000000, 0x00000000 // char 20 0x14
.word 0x00000000, 0x00000000 // char 21 0x15
.word 0x00000000, 0x00000000 // char 22 0x16
.word 0x00000000, 0x00000000 // char 23 0x17
.word 0x00000000, 0x00000000 // char 24 0x18
.word 0x00000000, 0x00000000 // char 25 0x19
.word 0x00000000, 0x00000000 // char 26 0x1a
.word 0x00000000, 0x00000000 // char 27 0x1b
.word 0x00000000, 0x00000000 // char 28 0x1c
.word 0x00000000, 0x00000000 // char 29 0x1d
.word 0x00000000, 0x00000000 // char 30 0x1e
.word 0x00000000, 0x00000000 // char 31 0x1f

// Character row 2 (chars 32-47) - Basic symbols
.word 0x00000000, 0x00000000 // char 32 ' ' space
.word 0x00000000, 0x00000000 // char 33 '!'
.word 0x00000000, 0x00000000 // char 34 '"'
.word 0x24247e24, 0x0000247e // char 35 '#' hash
.word 0x00000000, 0x00000000 // char 36 '$'
.word 0x00000000, 0x00000000 // char 37 '%'
.word 0x00000000, 0x00000000 // char 38 '&'
.word 0x00000000, 0x00000000 // char 39 '\''
.word 0x00000000, 0x00000000 // char 40 '('
.word 0x00000000, 0x00000000 // char 41 ')'
.word 0x7c284400, 0x00004428 // char 42 '*' asterisk
.word 0x7c101000, 0x00001010 // char 43 '+' plus
.word 0x00000000, 0x00000000 // char 44 ','
.word 0x7c000000, 0x00000000 // char 45 '-' minus
.word 0x00000000, 0x00000000 // char 46 '.'
.word 0x10080402, 0x00804020 // char 47 '/' slash

// Character row 3 (chars 48-63) - Numbers and symbols
.word 0x42424242, 0x007e4242 // char 48 '0'
.word 0x42424242, 0x007e4242 // char 49 '1'
.word 0x42424242, 0x007e4242 // char 50 '2'
.word 0x42424242, 0x007e4242 // char 51 '3'
.word 0x42424242, 0x007e4242 // char 52 '4'
.word 0x42424242, 0x007e4242 // char 53 '5'
.word 0x42424242, 0x007e4242 // char 54 '6'
.word 0x42424242, 0x007e4242 // char 55 '7'
.word 0x42424242, 0x007e4242 // char 56 '8'
.word 0x42424242, 0x007e4242 // char 57 '9'
.word 0x00000000, 0x00000000 // char 58 ':'
.word 0x00000000, 0x00000000 // char 59 ';'
.word 0x20100800, 0x00000810 // char 60 '<' less than
.word 0x007c0000, 0x0000007c // char 61 '=' equals
.word 0x08102000, 0x00002010 // char 62 '>' greater than
.word 0x00000000, 0x00000000 // char 63 '?'

// Character row 4 (chars 64-79) - @ and uppercase letters A-O
.word 0x00000000, 0x00000000 // char 64 '@'
.word 0x4242427e, 0x007e4242 // char 65 'A'
.word 0x4242427e, 0x007e4242 // char 66 'B'
.word 0x4242427e, 0x007e4242 // char 67 'C'
.word 0x4242427e, 0x007e4242 // char 68 'D'
.word 0x4242427e, 0x007e4242 // char 69 'E'
.word 0x4242427e, 0x007e4242 // char 70 'F'
.word 0x4242427e, 0x007e4242 // char 71 'G'
.word 0x4242427e, 0x007e4242 // char 72 'H'
.word 0x4242427e, 0x007e4242 // char 73 'I'
.word 0x4242427e, 0x007e4242 // char 74 'J'
.word 0x4242427e, 0x007e4242 // char 75 'K'
.word 0x4242427e, 0x007e4242 // char 76 'L'
.word 0x4242427e, 0x007e4242 // char 77 'M'
.word 0x4242427e, 0x007e4242 // char 78 'N'
.word 0x4242427e, 0x007e4242 // char 79 'O'

// Character row 5 (chars 80-95) - Letters P-Z and brackets
.word 0x4242427e, 0x007e4242 // char 80 'P'
.word 0x4242427e, 0x007e4242 // char 81 'Q'
.word 0x4242427e, 0x007e4242 // char 82 'R'
.word 0x4242427e, 0x007e4242 // char 83 'S'
.word 0x4242427e, 0x007e4242 // char 84 'T'
.word 0x4242427e, 0x007e4242 // char 85 'U'
.word 0x4242427e, 0x007e4242 // char 86 'V'
.word 0x4242427e, 0x007e4242 // char 87 'W'
.word 0x4242427e, 0x007e4242 // char 88 'X'
.word 0x4242427e, 0x007e4242 // char 89 'Y'
.word 0x4242427e, 0x007e4242 // char 90 'Z'
.word 0x4040407c, 0x007c4040 // char 91 '[' left bracket
.word 0x10204080, 0x00020408 // char 92 '\' backslash
.word 0x0404047c, 0x007c0404 // char 93 ']' right bracket
.word 0x44281000, 0x00000000 // char 94 '^' caret
.word 0x00000000, 0x007c0000 // char 95 '_' underscore

// Character row 6 (chars 96-111) - lowercase and more symbols
.word 0x00000000, 0x00000000 // char 96 '`'
.word 0x00000000, 0x00000000 // char 97 'a'
.word 0x00000000, 0x00000000 // char 98 'b'
.word 0x00000000, 0x00000000 // char 99 'c'
.word 0x00000000, 0x00000000 // char 100 'd'
.word 0x00000000, 0x00000000 // char 101 'e'
.word 0x00000000, 0x00000000 // char 102 'f'
.word 0x00000000, 0x00000000 // char 103 'g'
.word 0x00000000, 0x00000000 // char 104 'h'
.word 0x00000000, 0x00000000 // char 105 'i'
.word 0x00000000, 0x00000000 // char 106 'j'
.word 0x00000000, 0x00000000 // char 107 'k'
.word 0x00000000, 0x00000000 // char 108 'l'
.word 0x00000000, 0x00000000 // char 109 'm'
.word 0x00000000, 0x00000000 // char 110 'n'
.word 0x00000000, 0x00000000 // char 111 'o'

// Character row 7 (chars 112-127) - more lowercase and special chars
.word 0x00000000, 0x00000000 // char 112 'p'
.word 0x00000000, 0x00000000 // char 113 'q'
.word 0x00000000, 0x00000000 // char 114 'r'
.word 0x00000000, 0x00000000 // char 115 's'
.word 0x00000000, 0x00000000 // char 116 't'
.word 0x00000000, 0x00000000 // char 117 'u'
.word 0x00000000, 0x00000000 // char 118 'v'
.word 0x00000000, 0x00000000 // char 119 'w'
.word 0x00000000, 0x00000000 // char 120 'x'
.word 0x00000000, 0x00000000 // char 121 'y'
.word 0x00000000, 0x00000000 // char 122 'z'
.word 0x4020201c, 0x001c2020 // char 123 '{' left brace
.word 0x10101010, 0x00101010 // char 124 '|' pipe
.word 0x04080870, 0x00700808 // char 125 '}' right brace
.word 0x004c3200, 0x00000000 // char 126 '~' tilde
.word 0x00000000, 0x00000000 // char 127

// Rows 8-15 (chars 128-255) - Extended ASCII filled with blanks
.rept 128
.word 0x00000000, 0x00000000
.endr

//==============================================================================
// void draw_ascii_char_asm(uint32_t *pixels, int x, int y, char c,
//                          uint32_t color, int alpha)
//
// Draws one 8x8 glyph from ascii_font into an 800x600 buffer of 32-bit pixels.
//
// In:   x0 = pixels buffer (row-major, 800 pixels per row)
//       w1 = x of the glyph's top-left corner
//       w2 = y of the glyph's top-left corner
//       w3 = character code; values outside 0..255 are ignored
//       w4 = color (bits 23:16 = r, 15:8 = g, 7:0 = b)
//       w5 = alpha; 255 takes the fast path
// Out:  nothing
// Clobbers: x6-x15, flags (leaf routine, no stack use, no callee-saved regs)
//
// Behaviour:
//   - Nothing is drawn when the top-left corner is outside the 800x600 area.
//   - Pixels of the glyph that fall beyond the right or bottom edge are
//     clipped, so a glyph placed near an edge never wraps onto the next
//     scanline or writes past the last row.
//   - alpha == 255 stores (color | 0xFF000000). Any other alpha stores
//     0xFF000000 | (r*alpha/255 << 16) | (g*alpha/255 << 8) | (b*alpha/255):
//     the color is scaled by alpha; the existing destination pixel is NOT
//     read or mixed in.
//   - NOTE(review): alpha is assumed to be within 0..255; larger values make
//     the scaled channels overflow into their neighbours - confirm callers.
//
// Register roles inside the routine:
//   w6/w7 = glyph rows 0-3 / rows 4-7      w8  = final pixel value
//   w9    = row index                      w10 = pixel_y, then row base address
//   w11   = current row bitmap byte        w12 = column index
//==============================================================================
.equ DAC_VIS_WIDTH, 800
.equ DAC_VIS_HEIGHT, 600
.equ DAC_GLYPH_SIZE, 8

.global _draw_ascii_char_asm
_draw_ascii_char_asm:
    // Unsigned compares reject negative values and too-large values at once.
    cmp     w3, #255
    b.hi    .Ldac_return                    // c < 0 or c > 255
    cmp     w1, #DAC_VIS_WIDTH
    b.hs    .Ldac_return                    // x < 0 or x >= width
    cmp     w2, #DAC_VIS_HEIGHT
    b.hs    .Ldac_return                    // y < 0 or y >= height

    // Fetch the glyph once: 8 bytes per character.
    adr     x13, ascii_font
    add     x13, x13, w3, uxtw #3           // &ascii_font[c]
    ldp     w6, w7, [x13]                   // w6 = rows 0-3, w7 = rows 4-7

    // Work out the pixel value to store.
    cmp     w5, #255
    b.eq    .Ldac_alpha_opaque

    ubfx    w9, w4, #16, #8                 // r
    ubfx    w10, w4, #8, #8                 // g
    and     w11, w4, #0xFF                  // b
    mov     w12, #255
    mul     w9, w9, w5
    udiv    w9, w9, w12                     // r = r * alpha / 255
    mul     w10, w10, w5
    udiv    w10, w10, w12                   // g = g * alpha / 255
    mul     w11, w11, w5
    udiv    w11, w11, w12                   // b = b * alpha / 255
    mov     w8, #0xFF000000
    orr     w8, w8, w9, lsl #16
    orr     w8, w8, w10, lsl #8
    orr     w8, w8, w11
    b       .Ldac_render_bitmap

.Ldac_alpha_opaque:
    orr     w8, w4, #0xFF000000             // color with the top byte forced on

.Ldac_render_bitmap:
    mov     w9, wzr                         // row = 0

.Ldac_row_loop:
    add     w10, w2, w9                     // pixel_y = y + row
    cmp     w10, #DAC_VIS_HEIGHT
    b.hs    .Ldac_return                    // this row and all below are off-screen

    // Pick the word holding this row, then the byte inside it.
    cmp     w9, #4
    csel    w11, w6, w7, lo                 // rows 0-3 -> w6, rows 4-7 -> w7
    and     w12, w9, #3
    lsl     w12, w12, #3                    // (row % 4) * 8
    lsr     w11, w11, w12
    and     w11, w11, #0xFF                 // row bitmap byte
    cbz     w11, .Ldac_next_row             // nothing lit on this row

    // x10 = &pixels[pixel_y * width + x], the address of column 0.
    mov     w13, #DAC_VIS_WIDTH
    madd    w10, w10, w13, w1
    add     x10, x0, w10, uxtw #2

    mov     w12, wzr                        // col = 0

.Ldac_col_loop:
    add     w14, w1, w12                    // pixel_x = x + col
    cmp     w14, #DAC_VIS_WIDTH
    b.hs    .Ldac_next_row                  // remaining columns are off-screen

    lsl     w15, w11, w12                   // move bit (7 - col) up to bit 7
    tbz     w15, #7, .Ldac_next_col         // pixel not set
    str     w8, [x10, w12, uxtw #2]         // pixels[row_base + col] = value

.Ldac_next_col:
    add     w12, w12, #1
    cmp     w12, #DAC_GLYPH_SIZE
    b.lo    .Ldac_col_loop

.Ldac_next_row:
    add     w9, w9, #1
    cmp     w9, #DAC_GLYPH_SIZE
    b.lo    .Ldac_row_loop

.Ldac_return:
    ret
||ENDFILE||
at txn 0x80ab30462c6914f96a87df4a087ddf700820907a2326c2bc35d5a4e00b8af004 Aug-20-2025 12:43:11 AM UTC (28 days ago)

||FILE:asm/active/noise.s||
// AArch64 assembly implementation of noise_block
// void noise_block(rng_t *rng, float *out, uint32_t n)
// rng: pointer to rng_t { uint64_t state; }
// out: pointer to float buffer
// n : number of samples
// Generates white noise in range [-1,1)

.text
.align 2
.globl _noise_block

// 64-bit constants for SplitMix64 algorithm
// Read by _noise_block at byte offsets 0, 8 and 16 from noise_consts64.
// NOTE(review): the preceding ".align 2" only guarantees 4-byte alignment,
// so these .quad values may not be 8-byte aligned.
noise_consts64:
.quad 0x9E3779B97F4A7C15 // GAMMA constant to add to state
.quad 0xBF58476D1CE4E5B9 // MUL1 constant
.quad 0x94D049BB133111EB // MUL2 constant

// float constants
// Read by _noise_block at byte offsets 0, 4 and 8 from noise_consts32.
noise_consts32:
.float 5.9604644775390625e-8 // 1/2^24
.float 2.0 // scale [0,1) up to [0,2)
.float 1.0 // offset subtracted to reach [-1,1)

// Registers mapping (inside loop):
// x0 = rng*, x1 = out*, w2 = n, w3 = i (counter)
// x4 = tmp 64-bit value (state/z), x5 = const ptr
// s0 = float value, s1 = inv24, s2 = two, s3 = one

_noise_block:
    // void noise_block(rng_t *rng, float *out, uint32_t n)
    // In:  x0 = rng (64-bit state at offset 0), x1 = out, w2 = n
    // Out: out[0..n-1] filled with values in [-1,1); rng state advanced n times
    // Clobbers: x1, w2, x4-x8, x10-x13, s0-s3, flags
    //
    // The three SplitMix64 constants and the float scale factors are loaded
    // once before the loop, and the generator state stays in x4 for the whole
    // run and is written back once at the end.
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp

    cbz     w2, 2f                          // n == 0: nothing to do

    adrp    x5, noise_consts64@PAGE
    add     x5, x5, noise_consts64@PAGEOFF
    ldr     x11, [x5]                       // GAMMA
    ldr     x12, [x5, #8]                   // MUL1
    ldr     x13, [x5, #16]                  // MUL2

    adrp    x6, noise_consts32@PAGE
    add     x6, x6, noise_consts32@PAGEOFF
    ldr     s1, [x6]                        // 1/2^24
    ldr     s2, [x6, #4]                    // 2.0
    ldr     s3, [x6, #8]                    // 1.0

    ldr     x4, [x0]                        // state

1:  // one sample per iteration; w2 counts the samples still to produce
    add     x4, x4, x11                     // state += GAMMA

    eor     x8, x4, x4, lsr #30             // z = state ^ (state >> 30)
    mul     x8, x8, x12                     // z *= MUL1
    eor     x8, x8, x8, lsr #27             // z ^= z >> 27
    mul     x8, x8, x13                     // z *= MUL2
    eor     x8, x8, x8, lsr #31             // z ^= z >> 31

    lsr     w10, w8, #8                     // top 24 of the low 32 bits
    ucvtf   s0, w10
    fmul    s0, s0, s1                      // [0,1)
    fmul    s0, s0, s2                      // [0,2)
    fsub    s0, s0, s3                      // [-1,1)

    str     s0, [x1], #4                    // *out++ = sample

    subs    w2, w2, #1
    b.ne    1b

    str     x4, [x0]                        // publish the advanced state

2:
    ldp     x29, x30, [sp], #16
    ret ||ENDFILE||
||FILE:asm/active/osc_shapes.s||
.text
.align 2
.globl _osc_saw_block
.globl _osc_square_block
.globl _osc_triangle_block

// Shared constant
// Single-precision constants addressed by label from the oscillator routines
// and the PREP_PHASE macro in this file.
osc_TAU_const:
.float 6.2831855 // 2*pi: one full phase cycle
osc_half_TAU:
.float 3.1415927 // pi: half a phase cycle
osc_two_const:
.float 2.0
osc_one_const:
.float 1.0
osc_neg_one_const:
.float -1.0

// PREP_PHASE - common set-up for the block oscillators.
// In:  x0 = pointer whose first float is the current phase,
//      s0 = freq, s1 = sr
// Out: s2 = phase_inc = (TAU / sr) * freq
//      s3 = TAU, s5 = current phase, w3 = 0 (sample index)
// Clobbers: x9, s4
.macro PREP_PHASE
    mov     w3, wzr                         // i = 0
    ldr     s5, [x0]                        // ph
    adrp    x9, osc_TAU_const@PAGE
    add     x9, x9, osc_TAU_const@PAGEOFF
    ldr     s3, [x9]                        // TAU
    fdiv    s4, s3, s1                      // TAU / sr
    fmul    s2, s4, s0                      // phase_inc
.endm

// void osc_saw_block(osc_t*, float*, n, freq, sr)
// In:  x0 = osc (phase float at offset 0), x1 = out, w2 = n, s0 = freq, s1 = sr
// Out: out[i] = 2 * (phase / TAU) - 1 for each sample; phase written back
// Clobbers: x1, w3, x8, x9, s2-s7, s16, flags
//
// Only caller-saved FP registers are used (s8-s15 must be preserved for the
// caller), and the 1.0 constant is set up once instead of on every sample.
_osc_saw_block:
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp
    mov     x8, x0                          // keep osc pointer for the write-back
    PREP_PHASE
    fmov    s6, #2.0
    fmov    s16, #1.0
0:
    cmp     w3, w2
    b.hs    1f
    fdiv    s7, s5, s3                      // frac = ph / TAU
    fmul    s7, s7, s6                      // 2 * frac
    fsub    s7, s7, s16                     // 2 * frac - 1
    str     s7, [x1], #4
    fadd    s5, s5, s2                      // ph += phase_inc
    fcmpe   s5, s3
    b.lt    2f
    fsub    s5, s5, s3                      // wrap once past TAU
2:
    add     w3, w3, #1
    b       0b
1:
    str     s5, [x8]                        // save phase for the next block
    ldp     x29, x30, [sp], #16
    ret

// void osc_square_block(...)
// Same argument registers as the other block oscillators in this file:
// x0 = osc (phase float at offset 0), x1 = out, w2 = n, s0 = freq, s1 = sr
// Out: out[i] = +1 while phase < TAU/2, otherwise -1; phase written back
// Clobbers: x1, w3, x8, x9, x12, s2-s5, s7, s16-s18, flags
//
// Only caller-saved FP registers are used (s8-s15 must be preserved for the
// caller).
_osc_square_block:
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp
    mov     x8, x0                          // keep osc pointer for the write-back
    PREP_PHASE
    adrp    x12, osc_half_TAU@PAGE
    add     x12, x12, osc_half_TAU@PAGEOFF
    ldr     s16, [x12]                      // TAU / 2
    fmov    s17, #1.0
    fmov    s18, #-1.0
Lsq_loop:
    cmp     w3, w2
    b.hs    Lsq_done
    fcmpe   s5, s16
    fcsel   s7, s17, s18, lt                // (ph < half_tau) ? 1 : -1
    str     s7, [x1], #4
    fadd    s5, s5, s2                      // ph += phase_inc
    fcmpe   s5, s3
    b.lt    Lsq_wrap
    fsub    s5, s5, s3                      // wrap once past TAU
Lsq_wrap:
    add     w3, w3, #1
    b       Lsq_loop
Lsq_done:
    str     s5, [x8]                        // save phase for the next block
    ldp     x29, x30, [sp], #16
    ret

// void osc_triangle_block(...)
// Same argument registers as the other block oscillators in this file:
// x0 = osc (phase float at offset 0), x1 = out, w2 = n, s0 = freq, s1 = sr
// Out: out[i] = 2 * |2 * (phase / TAU) - 1| - 1; phase written back
// Clobbers: x1, w3, x8, x9, s2-s7, s16, s17, flags
//
// Only caller-saved FP registers are used (s8-s15 must be preserved for the
// caller).
_osc_triangle_block:
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp
    mov     x8, x0                          // keep osc pointer for the write-back
    PREP_PHASE
    fmov    s6, #2.0
    fmov    s7, #1.0
Ltr_loop:
    cmp     w3, w2
    b.hs    Ltr_done
    fdiv    s16, s5, s3                     // frac = ph / TAU
    fmul    s17, s16, s6
    fsub    s17, s17, s7                    // temp = 2 * frac - 1
    fabs    s17, s17
    fmul    s17, s17, s6
    fsub    s17, s17, s7                    // val = 2 * |temp| - 1
    str     s17, [x1], #4
    fadd    s5, s5, s2                      // ph += phase_inc
    fcmpe   s5, s3
    b.lt    Ltr_wrap
    fsub    s5, s5, s3                      // wrap once past TAU
Ltr_wrap:
    add     w3, w3, #1
    b       Ltr_loop
Ltr_done:
    str     s5, [x8]                        // save phase for the next block
    ldp     x29, x30, [sp], #16
    ret ||ENDFILE||
||FILE:asm/active/osc_sine.s||
.text
.align 2
.globl _osc_sine_block
// void osc_sine_block(osc_t *o, float *out, uint32_t n, float freq, float sr)
// x0=o, x1=out, w2=n, s0=freq, s1=sr
// Strategy: compute phase_inc = TAU*freq/sr once, then loop scalar using sinf

osc_sine_consts:
.float 6.2831855 // TAU

_osc_sine_block:
    // In:  x0 = osc (phase float at offset 0), x1 = out, w2 = n,
    //      s0 = freq, s1 = sr
    // Out: out[i] = sinf(phase) for each sample; phase written back to *osc
    //
    // sinf is an external call, so everything that must survive it lives in
    // callee-saved registers (x19-x21) or in this function's stack frame.
    //
    // Frame (64 bytes):
    //   sp+0  x29, x30
    //   sp+16 x19, x20
    //   sp+32 x21, x22
    //   sp+48 phase_inc   sp+52 TAU   sp+56 phase
    stp     x29, x30, [sp, #-64]!
    mov     x29, sp
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]

    mov     x19, x0                         // osc pointer
    mov     x20, x1                         // out cursor
    mov     w21, w2                         // samples left

    adrp    x11, osc_sine_consts@PAGE
    add     x11, x11, osc_sine_consts@PAGEOFF
    ldr     s3, [x11]                       // TAU

    fdiv    s4, s3, s1                      // TAU / sr
    fmul    s2, s4, s0                      // phase_inc = TAU / sr * freq

    str     s2, [sp, #48]                   // phase_inc
    str     s3, [sp, #52]                   // TAU
    ldr     s0, [x19]
    str     s0, [sp, #56]                   // phase

    cbz     w21, Lsine_done                 // n == 0: skip the loop

Lsine_loop:
    ldr     s0, [sp, #56]
    bl      _sinf
    str     s0, [x20], #4                   // *out++ = sinf(phase)

    ldr     s0, [sp, #56]                   // phase
    ldr     s1, [sp, #48]                   // phase_inc
    fadd    s0, s0, s1

    ldr     s2, [sp, #52]                   // TAU
    fcmpe   s0, s2
    b.lt    Lsine_store
    fsub    s0, s0, s2                      // wrap once past TAU
Lsine_store:
    str     s0, [sp, #56]

    subs    w21, w21, #1
    b.ne    Lsine_loop

Lsine_done:
    ldr     s0, [sp, #56]
    str     s0, [x19]                       // save phase for the next block

    ldp     x21, x22, [sp, #32]
    ldp     x19, x20, [sp, #16]
    ldp     x29, x30, [sp], #64
    ret ||ENDFILE||
||FILE:asm/active/sin4_ps_asm.s||
.text
.align 2
.globl _sin4_ps_asm
.private_extern sin4_ps_asm_internal
sin4_ps_asm_internal:
_sin4_ps_asm:
    // Four-lane single-precision sine.
    // In:  v0 = four float inputs x
    // Out: v0 = four results
    // Clobbers: x9, v1-v7, v16-v19 (all caller-saved; no stack use)
    //
    // Method:
    //   n  = round(x / pi)
    //   r  = x - n * pi, negated when n is odd
    //   z  = r * r
    //   p  = s1 + z * (s2 + z * (s3 + z * s4))
    //   result = r + r * (z * p)
    // The coefficients are combined in nested form so that s1 multiplies z
    // exactly once, s2 multiplies z^2, and so on.

    adrp    x9, Lsin_const@PAGE
    add     x9, x9, Lsin_const@PAGEOFF
    ldp     q1, q2, [x9]                    // inv_pi, pi
    ldp     q16, q17, [x9, #32]             // s2, s3
    ldp     q18, q19, [x9, #64]             // s4, s1

    // n = round-to-nearest-even(x * inv_pi), as integer and as float
    fmul    v3.4s, v0.4s, v1.4s
    frintn  v3.4s, v3.4s
    fcvtzs  v4.4s, v3.4s                    // n (int32)
    scvtf   v5.4s, v4.4s                    // n (float)

    // r = x - n * pi
    fmul    v6.4s, v5.4s, v2.4s
    fsub    v0.4s, v0.4s, v6.4s

    // Flip the sign of r where n is odd: move bit 0 of n into the sign bit.
    shl     v4.4s, v4.4s, #31
    eor     v0.16b, v0.16b, v4.16b

    // Polynomial in z = r * r
    fmul    v5.4s, v0.4s, v0.4s             // z

    mov     v6.16b, v17.16b                 // p = s3
    fmla    v6.4s, v5.4s, v18.4s            // p = s3 + z * s4
    mov     v7.16b, v16.16b                 // q = s2
    fmla    v7.4s, v5.4s, v6.4s             // q = s2 + z * p
    mov     v6.16b, v19.16b                 // p = s1
    fmla    v6.4s, v5.4s, v7.4s             // p = s1 + z * q

    fmul    v6.4s, v6.4s, v5.4s             // w = z * p
    fmla    v0.4s, v0.4s, v6.4s             // r += r * w

    ret

// Constant table (16-byte aligned)
.align 4
Lsin_const:
// inv_pi (1/π)
.float 0.31830988618379067154, 0.31830988618379067154, 0.31830988618379067154, 0.31830988618379067154
// pi
.float 3.14159265358979323846, 3.14159265358979323846, 3.14159265358979323846, 3.14159265358979323846
// s2
.float 8.3333337670e-3, 8.3333337670e-3, 8.3333337670e-3, 8.3333337670e-3
// s3
.float -1.9841270114e-4, -1.9841270114e-4, -1.9841270114e-4, -1.9841270114e-4
// s4
.float 2.7557314297e-6, 2.7557314297e-6, 2.7557314297e-6, 2.7557314297e-6
// s1
.float -1.6666664611e-1, -1.6666664611e-1, -1.6666664611e-1, -1.6666664611e-1 ||ENDFILE||
||FILE:asm/active/snare.s||
.text
.align 2
.globl _snare_process

// Lightweight Snare – envelope recurrence + inline SplitMix64 noise
// void snare_process(snare_t *s, float *L, float *R, uint32_t n)
// x0 = snare_t*
// x1 = L buffer
// x2 = R buffer
// w3 = n samples
//
// snare_t layout (see snare.h)
// uint32_t pos @ 0
// uint32_t len @ 4
// float sr @ 8 (unused here)
// float env @ 12
// float env_coef @ 16
// <padding> @ 20
// uint64_t rng.state @ 24

// --- Struct Offsets ---
// Byte offsets into snare_t, matching the layout listed above.
.equ S_POS, 0
.equ S_LEN, 4
.equ S_ENV, 12
.equ S_ENV_COEF, 16
.equ S_RNG_STATE, 24

// --- Constants ---
// All of the tables below are read by _snare_process via adrp/add.
AMP_const:
.float 0.4 // overall amplitude

// Three consecutive floats read at offsets 0, 4 and 8.
floats_inv24_two_one:
.float 5.9604644775390625e-8 // 1/2^24
.float 2.0
.float 1.0

// SplitMix64 constants read at offsets 0, 8 and 16.
rng64_consts:
.quad 0x9E3779B97F4A7C15 // GAMMA
.quad 0xBF58476D1CE4E5B9 // MUL1
.quad 0x94D049BB133111EB // MUL2

// ------------------------------------------------------------
// Main routine
// ------------------------------------------------------------
_snare_process:
    // void snare_process(snare_t *s, float *L, float *R, uint32_t n)
    // In:  x0 = snare_t*, x1 = L, x2 = R, w3 = n
    // Out: for each processed sample i: env *= env_coef, then
    //      L[i] and R[i] each gain env * noise * AMP with noise in [-1,1);
    //      pos, env and rng.state are written back to *s.
    //      Processing stops after n samples or when pos reaches len.
    //      If pos >= len on entry, or n == 0, nothing is read or written
    //      beyond pos/len.
    // Clobbers: w4-w10, x11-x13, x15, x16, s0-s2, s4, s5, s16-s19, flags
    //
    // Only caller-saved registers are used (d8-d15 and x19-x28 belong to the
    // caller), so nothing beyond the frame record is spilled. pos, len, n and
    // the sample index are unsigned 32-bit values and are compared and
    // extended as such.
    //
    // Register roles:
    //   w4 = i            w8 = pos          w9 = len        x10 = rng state
    //   x5 = GAMMA        x6 = MUL1         x7 = MUL2
    //   s4 = env          s5 = env_coef
    //   s16 = 1/2^24      s17 = 2.0         s18 = 1.0       s19 = AMP
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp

    // Early exit if inactive or n == 0
    ldr     w8, [x0, #S_POS]
    ldr     w9, [x0, #S_LEN]
    cmp     w8, w9
    b.hs    Lsnare_done                     // already finished
    cbz     w3, Lsnare_done

    // Mutable state
    ldr     s4, [x0, #S_ENV]
    ldr     s5, [x0, #S_ENV_COEF]
    ldr     x10, [x0, #S_RNG_STATE]

    // Constants, loaded once
    adrp    x11, AMP_const@PAGE
    add     x11, x11, AMP_const@PAGEOFF
    ldr     s19, [x11]                      // AMP

    adrp    x12, floats_inv24_two_one@PAGE
    add     x12, x12, floats_inv24_two_one@PAGEOFF
    ldr     s16, [x12]                      // 1/2^24
    ldr     s17, [x12, #4]                  // 2.0
    ldr     s18, [x12, #8]                  // 1.0

    adrp    x13, rng64_consts@PAGE
    add     x13, x13, rng64_consts@PAGEOFF
    ldr     x5, [x13]                       // GAMMA
    ldr     x6, [x13, #8]                   // MUL1
    ldr     x7, [x13, #16]                  // MUL2

    mov     w4, wzr                         // i = 0

    // Invariant at loop entry: i < n and pos < len (checked above / below).
Lsnare_loop:
    fmul    s4, s4, s5                      // env *= env_coef

    // SplitMix64 step
    add     x10, x10, x5                    // state += GAMMA
    eor     x15, x10, x10, lsr #30
    mul     x15, x15, x6
    eor     x15, x15, x15, lsr #27
    mul     x15, x15, x7
    eor     x15, x15, x15, lsr #31          // z

    // noise in [-1,1) from bits 31:8 of z
    lsr     w16, w15, #8
    ucvtf   s0, w16
    fmul    s0, s0, s16                     // [0,1)
    fmul    s0, s0, s17                     // [0,2)
    fsub    s0, s0, s18                     // [-1,1)

    // sample = noise * env * AMP
    fmul    s0, s0, s4
    fmul    s0, s0, s19

    // L[i] += sample
    ldr     s1, [x1, w4, uxtw #2]
    fadd    s1, s1, s0
    str     s1, [x1, w4, uxtw #2]

    // R[i] += sample
    ldr     s2, [x2, w4, uxtw #2]
    fadd    s2, s2, s0
    str     s2, [x2, w4, uxtw #2]

    add     w8, w8, #1                      // pos++
    add     w4, w4, #1                      // i++
    cmp     w4, w3
    b.hs    Lsnare_end                      // produced n samples
    cmp     w8, w9
    b.lo    Lsnare_loop                     // voice still active

Lsnare_end:
    // Store updated state back to struct
    str     s4, [x0, #S_ENV]
    str     w8, [x0, #S_POS]
    str     x10, [x0, #S_RNG_STATE]

Lsnare_done:
    ldp     x29, x30, [sp], #16
    ret ||ENDFILE||
at txn 0x60799a356730705eb35500a65696934c881519893012fab111862bfbdde1a656 Aug-20-2025 12:42:47 AM UTC (28 days ago)
/ Restore stack

b .Lrot_done

.Lrot_scalar:
// Scalar fallback for arbitrary sizes
// Copy pattern to tmp buffer
mov w4, wzr // w4 = loop counter for memcpy

.Lrot_copy_loop:
cmp w4, w2
b.ge .Lrot_rotate_start
ldrb w5, [x0, w4, uxtw] // Load pattern[i]
strb w5, [x1, w4, uxtw] // Store to tmp[i]
add w4, w4, #1
b .Lrot_copy_loop

.Lrot_rotate_start:
// Rotate: pattern[i] = tmp[(i+rot) % size]
mov w4, wzr // w4 = i (loop counter)

.Lrot_rotate_loop:
cmp w4, w2
b.ge .Lrot_done

// Calculate src_index = (i + rot) % size
add w5, w4, w3 // w5 = i + rot
udiv w6, w5, w2 // w6 = (i + rot) / size
msub w5, w6, w2, w5 // w5 = (i + rot) - (w6 * size) = (i + rot) % size

// pattern[i] = tmp[src_index]
ldrb w6, [x1, w5, uxtw] // Load tmp[src_index]
strb w6, [x0, w4, uxtw] // Store to pattern[i]

add w4, w4, #1 // i++
b .Lrot_rotate_loop

.Lrot_done:
ret

/*
 * generator_build_events_asm - fill the event queue for all 32 steps
 * ----------------------------------------------------------------------
 * void generator_build_events_asm(event_queue_t *q, rng_t *rng,
 *     const uint8_t *kick_pat, const uint8_t *snare_pat, const uint8_t *hat_pat,
 *     uint32_t step_samples);
 *
 * In:  x0 = q, x1 = rng, x2 = kick_pat, x3 = snare_pat, x4 = hat_pat,
 *      w5 = step_samples
 * Out: q->count (32-bit word at byte offset 4096) reset to 0 and then grown by
 *      one for every event pushed; the rng state is advanced by every random
 *      draw.
 *
 * For each step in 0..31, with t = step * step_samples and
 * bar_step = step % 16, events are pushed in this order:
 *   - KICK   (type 0, aux 0) if kick_pat[bar_step]  != 0
 *   - SNARE  (type 1, aux 0) if snare_pat[bar_step] != 0
 *   - HAT    (type 2, aux 0) if hat_pat[bar_step]   != 0
 *   - MELODY (type 3, aux bar_step / 8) if bar_step is a multiple of 8
 *   - MID    (type 4, aux rng_u32 % 7) if bar_step % 4 == 2, or if
 *            bar_step % 4 is 1 or 3 and a random float in [0,1) is < 0.1
 *   - FM_BASS (type 5, aux 0) if bar_step == 0
 *
 * NOTE(review): the melody rule is described as firing at 0, 8, 16 and 24,
 * but bar_step is step % 16 and so only ever takes the values 0..15; melody
 * aux is therefore always 0 or 1. Confirm against the C implementation
 * whether the index was meant to be step % 32.
 *
 * The frame record (x29/x30) is saved because this routine makes calls.
 * t and bar_step are kept in callee-saved w26/w27 and t is copied into w6
 * (the helper's time input) immediately before each push, so no live value
 * sits in a caller-saved register across a call.
 *
 * Helper register interface (private to this file):
 *   _generator_eq_push_helper_asm: w6 = time, w10 = type, w11 = aux, x19 = q
 *   _generator_rng_next_u32_asm:   x20 = rng, returns w0
 *   _generator_rng_next_float_asm: x20 = rng, returns s0
 */

.globl _generator_build_events_asm

_generator_build_events_asm:
    // Frame: x29/x30 plus x19-x28 (96 bytes, keeps sp 16-byte aligned)
    stp     x29, x30, [sp, #-96]!
    mov     x29, sp
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]
    stp     x25, x26, [sp, #64]
    stp     x27, x28, [sp, #80]

    str     wzr, [x0, #4096]                // q->count = 0

    mov     x19, x0                         // q
    mov     x20, x1                         // rng
    mov     x21, x2                         // kick_pat
    mov     x22, x3                         // snare_pat
    mov     x23, x4                         // hat_pat
    mov     w24, w5                         // step_samples
    mov     w25, wzr                        // step

.Lbuild_loop:
    cmp     w25, #32                        // TOTAL_STEPS
    b.hs    .Lbuild_done

    mul     w26, w25, w24                   // t = step * step_samples
    and     w27, w25, #15                   // bar_step = step % 16

    // Kick
    ldrb    w9, [x21, w27, uxtw]
    cbz     w9, .Lcheck_snare
    mov     w6, w26
    mov     w10, #0                         // EVT_KICK
    mov     w11, #0
    bl      _generator_eq_push_helper_asm

.Lcheck_snare:
    ldrb    w9, [x22, w27, uxtw]
    cbz     w9, .Lcheck_hat
    mov     w6, w26
    mov     w10, #1                         // EVT_SNARE
    mov     w11, #0
    bl      _generator_eq_push_helper_asm

.Lcheck_hat:
    ldrb    w9, [x23, w27, uxtw]
    cbz     w9, .Lcheck_melody
    mov     w6, w26
    mov     w10, #2                         // EVT_HAT
    mov     w11, #0
    bl      _generator_eq_push_helper_asm

.Lcheck_melody:
    tst     w27, #7                         // bar_step multiple of 8?
    b.ne    .Lcheck_mid
    mov     w6, w26
    mov     w10, #3                         // EVT_MELODY
    lsr     w11, w27, #3                    // aux = bar_step / 8
    bl      _generator_eq_push_helper_asm

.Lcheck_mid:
    and     w9, w27, #3                     // bar_step % 4
    cmp     w9, #2
    b.eq    .Lmid_trigger                   // always fires on 2
    cbz     w9, .Lcheck_bass                // never fires on 0

    // bar_step % 4 is 1 or 3: fire with probability 0.1
    bl      _generator_rng_next_float_asm   // s0 in [0,1)
    mov     w12, #0xcccd
    movk    w12, #0x3dcc, lsl #16           // 0x3DCCCCCD = 0.1f
    fmov    s1, w12
    fcmp    s0, s1
    b.ge    .Lcheck_bass

.Lmid_trigger:
    bl      _generator_rng_next_u32_asm     // w0 = random u32
    mov     w12, #7
    udiv    w13, w0, w12
    msub    w11, w13, w12, w0               // aux = w0 % 7
    mov     w6, w26
    mov     w10, #4                         // EVT_MID
    bl      _generator_eq_push_helper_asm

.Lcheck_bass:
    cbnz    w27, .Lloop_next
    mov     w6, w26
    mov     w10, #5                         // EVT_FM_BASS
    mov     w11, #0
    bl      _generator_eq_push_helper_asm

.Lloop_next:
    add     w25, w25, #1
    b       .Lbuild_loop

.Lbuild_done:
    ldp     x27, x28, [sp, #80]
    ldp     x25, x26, [sp, #64]
    ldp     x23, x24, [sp, #48]
    ldp     x21, x22, [sp, #32]
    ldp     x19, x20, [sp, #16]
    ldp     x29, x30, [sp], #96
    ret

/*
 * Append one event to the queue (eq_push equivalent).
 * In:  x19 = q, w6 = time, w10 = type, w11 = aux
 * Out: if q->count (word at byte offset 4096) is below 512, the 8-byte slot
 *      q->events[count] receives time at +0, type at +4 and aux at +5, and
 *      count is incremented; otherwise nothing changes.
 * Clobbers: w12, x15, flags. w6, w10 and w11 are left untouched.
 */
.globl _generator_eq_push_helper_asm
_generator_eq_push_helper_asm:
    ldr     w12, [x19, #4096]               // count
    cmp     w12, #512                       // MAX_EVENTS
    b.ge    .Leq_push_ret                   // queue full: drop the event

    add     x15, x19, w12, uxtw #3          // &q->events[count], 8 bytes each
    str     w6, [x15]                       // event.time
    strb    w10, [x15, #4]                  // event.type
    strb    w11, [x15, #5]                  // event.aux

    add     w12, w12, #1
    str     w12, [x19, #4096]               // count++

.Leq_push_ret:
    ret

/*
 * Helper function: rng_next_u32 equivalent (one SplitMix64 step)
 * In:  x20 = rng (64-bit state at offset 0)
 * Out: w0 = low 32 bits of the mixed value (upper half of x0 is zero);
 *      rng->state advanced by GAMMA
 * Clobbers: x0, x1. x20 is preserved.
 *
 *   state += 0x9E3779B97F4A7C15
 *   z = state
 *   z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9
 *   z = (z ^ (z >> 27)) * 0x94D049BB133111EB
 *   return (uint32_t)(z ^ (z >> 31))
 *
 * Each 64-bit constant is assembled from four 16-bit pieces, lowest first;
 * the pieces are listed next to the instructions so they can be checked
 * against the full value.
 */
.globl _generator_rng_next_u32_asm
_generator_rng_next_u32_asm:
    stp     x29, x30, [sp, #-16]!

    // state += GAMMA
    ldr     x0, [x20]
    movz    x1, #0x7C15                     // 0x9E37_79B9_7F4A_[7C15]
    movk    x1, #0x7F4A, lsl #16            // 0x9E37_79B9_[7F4A]_7C15
    movk    x1, #0x79B9, lsl #32            // 0x9E37_[79B9]_7F4A_7C15
    movk    x1, #0x9E37, lsl #48            // 0x[9E37]_79B9_7F4A_7C15
    add     x0, x0, x1
    str     x0, [x20]

    // z = (z ^ (z >> 30)) * MUL1
    eor     x0, x0, x0, lsr #30
    movz    x1, #0xE5B9                     // 0xBF58_476D_1CE4_[E5B9]
    movk    x1, #0x1CE4, lsl #16            // 0xBF58_476D_[1CE4]_E5B9
    movk    x1, #0x476D, lsl #32            // 0xBF58_[476D]_1CE4_E5B9
    movk    x1, #0xBF58, lsl #48            // 0x[BF58]_476D_1CE4_E5B9
    mul     x0, x0, x1

    // z = (z ^ (z >> 27)) * MUL2
    eor     x0, x0, x0, lsr #27
    movz    x1, #0x11EB                     // 0x94D0_49BB_1331_[11EB]
    movk    x1, #0x1331, lsl #16            // 0x94D0_49BB_[1331]_11EB
    movk    x1, #0x49BB, lsl #32            // 0x94D0_[49BB]_1331_11EB
    movk    x1, #0x94D0, lsl #48            // 0x[94D0]_49BB_1331_11EB
    mul     x0, x0, x1

    // z ^= z >> 31, then keep the low 32 bits
    eor     x0, x0, x0, lsr #31
    mov     w0, w0

    ldp     x29, x30, [sp], #16
    ret

/*
 * Helper function: rng_next_float equivalent
 * In:  x20 = rng
 * Out: s0 = (rng_next_u32 >> 8) * (1.0f / 16777216.0f), a float in [0,1)
 * Clobbers: whatever _generator_rng_next_u32_asm clobbers, plus s0.
 */
.globl _generator_rng_next_float_asm
_generator_rng_next_float_asm:
    stp     x29, x30, [sp, #-16]!           // keep the return address across the call

    bl      _generator_rng_next_u32_asm     // w0 = random u32
    lsr     w0, w0, #8                      // keep 24 bits

    // Fixed-point convert with 24 fraction bits: s0 = w0 / 2^24. The 24-bit
    // integer and the power-of-two scale are both exact in single precision.
    ucvtf   s0, w0, #24

    ldp     x29, x30, [sp], #16
    ret

.section __TEXT,__cstring
.Ldbg_fmt:
.asciz "ASM: rem=%u proc=%u pos=%u\n"
.Ldbg_pre_fmt:
.asciz "PRE: rem=%u step=%u pos=%u\n"
.Ldbg_post1_fmt:
.asciz "P1: rem=%u proc=%u pos=%u\n"
.Ldbg_rms_fmt:
.asciz "RMSraw=%u\n"
.Ldbg_scratch_fmt:
.asciz "SCR drums=%u synth=%u n=%u\n"
.Loutput_silent_msg:
.asciz "OUTPUT_BUFFER_SILENT\n"
.Loutput_audio_msg:
.asciz "OUTPUT_BUFFER_HAS_AUDIO\n"
.Lmix_null_msg:
.asciz "MIX_NULL_POINTER_ERROR\n"||ENDFILE||
||FILE:asm/active/hat.s||
.text
.align 2
.globl _hat_process

// Lightweight Hat – fast-decay envelope + white noise (SplitMix64)
// void hat_process(hat_t *h, float *L, float *R, uint32_t n)
// x0 = hat_t*
// x1 = L buffer
// x2 = R buffer
// w3 = n samples
//
// hat_t layout (see hat.h)
// uint32_t pos @ 0
// uint32_t len @ 4
// float sr @ 8 (unused)
// float env @ 12
// float env_coef @ 16
// rng_t rng @ 24 (state 64-bit)

// --- Struct Offsets ---
.equ H_POS, 0
.equ H_LEN, 4
.equ H_ENV, 12
.equ H_ENV_COEF, 16
.equ H_RNG_STATE, 24

// --- Constants ---
H_AMP_const:
.float 0.15 // hat amplitude

f_inv24_two_one:
.float 5.9604644775390625e-8 // 1/2^24
.float 2.0
.float 1.0

rng64_consts_hat:
.quad 0x9E3779B97F4A7C15 // GAMMA
.quad 0xBF58476D1CE4E5B9 // MUL1
.quad 0x94D049BB133111EB // MUL2

// ------------------------------------------------------------
_hat_process:
    // void hat_process(hat_t *h, float *L, float *R, uint32_t n)
    //   x0 = h, x1 = L, x2 = R, w3 = n
    //
    // Adds a decaying white-noise burst to L[i] and R[i] (same sample on both
    // channels) for i in [0, n) while pos < len, then writes env, pos and the
    // RNG state back to *h. Does nothing if pos >= len or n == 0.
    //
    // Register roles:
    //   w8 = pos, w9 = len, x10 = RNG state, w22 = i
    //   x14 = GAMMA, x6 = MUL1, x7 = MUL2, x15 = mixing temp
    //   s4 = env, s5 = env_coef
    //   s16 = 1/2^24, s17 = 2.0, s18 = 1.0, s19 = amplitude
    //
    // Only caller-saved FP registers are used (v0-v7, v16-v31); the low halves
    // of v8-v15 are callee-saved under AAPCS64 and are left untouched.
    // pos, len, n and i are unsigned, so compares use b.hs and the buffer
    // index is zero-extended (uxtw).

    // Prologue: minimal stack frame
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp
    // Preserve callee-saved x22 (loop counter)
    str     x22, [sp, #-16]!

    // Early exits
    ldr     w8, [x0, #H_POS]
    ldr     w9, [x0, #H_LEN]
    cmp     w8, w9
    b.hs    Ldone                       // inactive: pos >= len
    cbz     w3, Ldone                   // n == 0

    // Load state
    ldr     s4, [x0, #H_ENV]
    ldr     s5, [x0, #H_ENV_COEF]
    ldr     x10, [x0, #H_RNG_STATE]

    // Constants
    adrp    x11, H_AMP_const@PAGE
    add     x11, x11, H_AMP_const@PAGEOFF
    ldr     s19, [x11]                  // AMP

    adrp    x12, f_inv24_two_one@PAGE
    add     x12, x12, f_inv24_two_one@PAGEOFF
    ldr     s16, [x12]                  // 1/2^24
    ldr     s17, [x12, #4]              // 2.0
    ldr     s18, [x12, #8]              // 1.0

    // SplitMix64 constants are loop-invariant: load them once.
    adrp    x13, rng64_consts_hat@PAGE
    add     x13, x13, rng64_consts_hat@PAGEOFF
    ldr     x14, [x13]                  // GAMMA
    ldr     x6, [x13, #8]               // MUL1
    ldr     x7, [x13, #16]              // MUL2

    mov     w22, wzr                    // i = 0

    // --- Main Loop ---
Lloop:
    cmp     w22, w3
    b.hs    Lend                        // i >= n
    cmp     w8, w9
    b.hs    Lend                        // pos >= len

    // env *= env_coef
    fmul    s4, s4, s5

    // SplitMix64: state += GAMMA, then mix a copy of the new state
    add     x10, x10, x14
    eor     x15, x10, x10, lsr #30
    mul     x15, x15, x6
    eor     x15, x15, x15, lsr #27
    mul     x15, x15, x7
    eor     x15, x15, x15, lsr #31

    // noise = ((uint32_t)z >> 8) * 2^-24 * 2 - 1
    lsr     w15, w15, #8
    ucvtf   s0, w15
    fmul    s0, s0, s16
    fmul    s0, s0, s17
    fsub    s0, s0, s18

    // sample = noise * env * AMP
    fmul    s0, s0, s4
    fmul    s0, s0, s19

    // L[i] += sample
    ldr     s1, [x1, w22, uxtw #2]
    fadd    s1, s1, s0
    str     s1, [x1, w22, uxtw #2]

    // R[i] += sample
    ldr     s2, [x2, w22, uxtw #2]
    fadd    s2, s2, s0
    str     s2, [x2, w22, uxtw #2]

    // Advance
    add     w8, w8, #1                  // pos++
    add     w22, w22, #1                // i++
    b       Lloop

    // --- Exit ---
Lend:
    str     s4, [x0, #H_ENV]
    str     w8, [x0, #H_POS]
    str     x10, [x0, #H_RNG_STATE]

Ldone:
    ldr     x22, [sp], #16
    ldp     x29, x30, [sp], #16
    ret ||ENDFILE||
||FILE:asm/active/kick.s||
.text
.align 2
.globl _kick_process

// --- Constants ---
AMP_const:
.float 0.9 // overall amplitude (reduced from 1.2 - was too loud)

// Offsets inside kick_t struct (see kick.h)
.equ K_SR, 0 // float sr
.equ K_POS, 4 // uint32 pos
.equ K_LEN, 8 // uint32 len
.equ K_ENV, 12 // float env
.equ K_ENV_COEF, 16 // float env_coef
.equ K_Y_PREV, 28 // float y_prev (sin(theta[n-1]))
.equ K_Y_PREV2, 32 // float y_prev2 (sin(theta[n-2]))
.equ K_K1, 36 // float k1 = 2*cos(delta)

// void kick_process(kick_t *k, float *L, float *R, uint32_t n)
// x0 = kick*, x1 = L*, x2 = R*, w3 = n
_kick_process:
    // void kick_process(kick_t *k, float *L, float *R, uint32_t n)
    //   x0 = k, x1 = L, x2 = R, w3 = n
    //
    // Adds an enveloped sine (generated by the recurrence
    // y[n] = k1*y[n-1] - y[n-2]) to L[i] and R[i] for i in [0, n) while
    // pos < len, then writes env, y_prev, y_prev2 and pos back to *k.
    // Does nothing if pos >= len or n == 0.
    //
    // Register roles:
    //   w9 = pos, w10 = len, w22 = i
    //   s4 = env, s5 = env_coef, s6 = y_prev, s7 = y_prev2
    //   s16 = k1, s17 = y, s18 = amplitude
    //
    // Only caller-saved FP registers are used (v0-v7, v16-v31); the low halves
    // of v8-v15 are callee-saved under AAPCS64 and are left untouched.
    // pos, len, n and i are unsigned, so compares use b.hs and the buffer
    // index is zero-extended (uxtw).

    // Prologue (no library calls are made)
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp
    // Preserve callee-saved x22 (used as loop counter)
    str     x22, [sp, #-16]!

    // Early-out: inactive or n == 0
    ldr     w9, [x0, #K_POS]
    ldr     w10, [x0, #K_LEN]
    cmp     w9, w10
    b.hs    Ldone                       // pos >= len
    cbz     w3, Ldone                   // n == 0

    // Load state into FP regs
    ldr     s4, [x0, #K_ENV]            // env
    ldr     s5, [x0, #K_ENV_COEF]       // env_coef
    ldr     s6, [x0, #K_Y_PREV]         // y_prev
    ldr     s7, [x0, #K_Y_PREV2]        // y_prev2
    ldr     s16, [x0, #K_K1]            // k1 = 2*cos(delta)

    // Load AMP constant once
    adrp    x11, AMP_const@PAGE
    add     x11, x11, AMP_const@PAGEOFF
    ldr     s18, [x11]

    mov     w22, wzr                    // i = 0

Lloop:
    // Check end conditions
    cmp     w22, w3
    b.hs    Lend                        // i >= n
    cmp     w9, w10
    b.hs    Lend                        // pos >= len

    // env *= env_coef
    fmul    s4, s4, s5

    // y = k1*y_prev - y_prev2
    fmul    s17, s16, s6
    fsub    s17, s17, s7

    // sample = env * y * AMP
    fmul    s0, s4, s17
    fmul    s0, s0, s18

    // L[i] += sample
    ldr     s1, [x1, w22, uxtw #2]
    fadd    s1, s1, s0
    str     s1, [x1, w22, uxtw #2]

    // R[i] += sample
    ldr     s2, [x2, w22, uxtw #2]
    fadd    s2, s2, s0
    str     s2, [x2, w22, uxtw #2]

    // Update sine recurrence state
    fmov    s7, s6                      // y_prev2 = old y_prev
    fmov    s6, s17                     // y_prev  = y

    // Increment counters
    add     w9, w9, #1                  // pos++
    add     w22, w22, #1                // i++
    b       Lloop

Lend:
    // Store back updated state
    str     s4, [x0, #K_ENV]
    str     s6, [x0, #K_Y_PREV]
    str     s7, [x0, #K_Y_PREV2]
    str     w9, [x0, #K_POS]

Ldone:
    ldr     x22, [sp], #16
    ldp     x29, x30, [sp], #16
    ret ||ENDFILE||
||FILE:asm/active/limiter.s||
// AArch64 assembly implementation of soft-knee limiter
// void limiter_process(limiter_t *l, float *L, float *R, uint32_t n)
// l: x0 = pointer to limiter_t { attack_coeff, release_coeff, envelope, threshold, knee_width }
// L: x1 = left channel buffer
// R: x2 = right channel buffer
// n: w3 = number of samples
//
// Uses soft-knee compression with envelope follower

.text
.align 2
.globl _limiter_process

limiter_consts:
.float 20.0
.float 2.0
.float 10.0
.float -0.5
.float 1.0

_limiter_process:
    // void limiter_process(limiter_t *l, float *L, float *R, uint32_t n)
    //   x0 = l  { attack_coeff @0, release_coeff @4, envelope @8,
    //             threshold @12, knee_width @16 }
    //   x1 = L, x2 = R, w3 = n
    //
    // Per sample: follow the stereo peak with a one-pole envelope, convert the
    // envelope/threshold ratio to dB, derive a gain reduction (none below the
    // knee, quadratic inside it, full overshoot above it) and scale both
    // channels when the resulting linear gain is below 1.0. The final envelope
    // is written back to l->envelope.
    //
    // Callee-saved registers: x19 = l, x20 = L, x21 = R, w22 = n, w23 = i.
    // All five are saved in the frame (x23 at [sp, #88]).
    // FP work uses only caller-saved registers (v0-v7, v16-v31). Nothing is
    // kept in an FP register across log10f/powf; values that must survive
    // the calls live in the frame:
    //   [sp,#48] attack  [sp,#52] release  [sp,#56] threshold  [sp,#60] knee
    //   [sp,#64] envelope  [sp,#68] -knee/2  [sp,#72] +knee/2
    //   [sp,#76] L[i]  [sp,#80] R[i]

    stp     x29, x30, [sp, #-96]!
    mov     x29, sp
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    str     x23, [sp, #88]

    // Early exit if n == 0
    cbz     w3, Llim_done

    // Save arguments
    mov     x19, x0                     // limiter struct
    mov     x20, x1                     // L pointer
    mov     x21, x2                     // R pointer
    mov     w22, w3                     // n

    // Snapshot limiter parameters into the frame
    ldr     s0, [x19]                   // attack_coeff
    ldr     s1, [x19, #4]               // release_coeff
    ldr     s2, [x19, #8]               // envelope
    ldr     s3, [x19, #12]              // threshold
    ldr     s4, [x19, #16]              // knee_width
    str     s0, [sp, #48]
    str     s1, [sp, #52]
    str     s3, [sp, #56]
    str     s4, [sp, #60]
    str     s2, [sp, #64]               // envelope (updated every sample)

    // Knee bounds in dB relative to threshold
    fmov    s5, #-0.5
    fmul    s5, s4, s5                  // -knee_width/2
    str     s5, [sp, #68]
    fneg    s6, s5                      // +knee_width/2
    str     s6, [sp, #72]

    mov     w23, wzr                    // i = 0

Llim_loop:
    cmp     w23, w22
    b.hs    Llim_store_env              // i >= n -> exit loop

    // Load samples and park them in the frame for after the libm calls
    ldr     s7, [x20, w23, uxtw #2]     // L[i]
    ldr     s16, [x21, w23, uxtw #2]    // R[i]
    str     s7, [sp, #76]
    str     s16, [sp, #80]

    // peak = max(|L|, |R|)
    fabs    s17, s7
    fabs    s18, s16
    fmax    s19, s17, s18

    // Envelope follower: env = peak + coeff * (env - peak),
    // coeff = attack when peak > env, release otherwise
    ldr     s2, [sp, #64]               // current envelope
    fsub    s20, s2, s19                // env - peak
    fcmp    s19, s2
    b.le    1f
    ldr     s0, [sp, #48]               // attack_coeff
    b       2f
1:
    ldr     s0, [sp, #52]               // release_coeff
2:
    fmadd   s2, s0, s20, s19
    str     s2, [sp, #64]

    // overshoot_db = 20 * log10(env / thresh)
    ldr     s3, [sp, #56]
    fdiv    s0, s2, s3
    bl      _log10f
    fmov    s20, #20.0
    fmul    s21, s0, s20                // s21 = overshoot_db

    // gain_reduction_db
    fmov    s22, wzr                    // default: no reduction
    ldr     s5, [sp, #68]               // -knee_width/2
    fcmp    s21, s5
    b.le    3f                          // below knee
    ldr     s6, [sp, #72]               // +knee_width/2
    fcmp    s21, s6
    b.ge    4f                          // above knee

    // Inside knee: (overshoot_db + knee/2)^2 / (2 * knee)
    fsub    s23, s21, s5
    fmul    s23, s23, s23
    ldr     s4, [sp, #60]
    fmov    s24, #2.0
    fmul    s24, s24, s4
    fdiv    s22, s23, s24
    b       3f

4:  // Above knee: reduce by the full overshoot
    fmov    s22, s21

3:  // gain = powf(10, -gain_reduction_db / 20)
    fneg    s23, s22
    fmov    s24, #20.0
    fdiv    s1, s23, s24
    fmov    s0, #10.0
    bl      _powf                       // s0 = gain

    // Reload samples and attenuate only when gain < 1.0
    ldr     s7, [sp, #76]
    ldr     s16, [sp, #80]
    fmov    s18, #1.0
    fcmp    s0, s18
    b.ge    5f
    fmul    s7, s7, s0
    fmul    s16, s16, s0

5:  // Store processed samples
    str     s7, [x20, w23, uxtw #2]
    str     s16, [x21, w23, uxtw #2]

    add     w23, w23, #1
    b       Llim_loop

Llim_store_env:
    // Store final envelope
    ldr     s2, [sp, #64]
    str     s2, [x19, #8]

Llim_done:
    // Restore registers
    ldr     x23, [sp, #88]
    ldp     x19, x20, [sp, #16]
    ldp     x21, x22, [sp, #32]
    ldp     x29, x30, [sp], #96
    ret ||ENDFILE||
||FILE:asm/active/melody.s||
.text
.align 2
.globl _melody_process

// Simplified melody implementation avoiding libm calls
// Uses polynomial approximation for exponential decay
melody_constants:
.float 6.2831853071795864769 // [0] TAU
.float 5.0 // [4] DECAY_RATE
.float 0.07 // [8] AMP
.float 2.0 // [12] TWO
.float 1.0 // [16] ONE
.float 1.2 // [20] DRIVE_GAIN
.float 1.5 // [24] SOFT_A
.float 0.5 // [28] SOFT_B

_melody_process:
    // void melody_process(melody_t *m, float *L, float *R, uint32_t n)
    //   x0 = m { phase @0 (float), pos @4, len @8 (uint32), sr @12, freq @16 (float) }
    //   x1 = L, x2 = R, w3 = n
    //
    // Adds a soft-clipped sawtooth to L[i] and R[i] for i in [0, n) while
    // pos < len, then writes phase and pos back to *m. Per sample:
    //   env    = 1 / (1 + DECAY_RATE * pos/sr)
    //   raw    = 2 * phase/TAU - 1
    //   driven = DRIVE_GAIN * raw
    //   soft   = SOFT_A*driven - SOFT_B*driven^3
    //   sample = soft * env * AMP
    //
    // Register roles:
    //   x19 = m, x23 = L, x24 = R, w22 = n, w20 = pos, w21 = len, w4 = i
    //   s0 = phase, s1 = sr, s2 = freq, s3 = phase_inc
    //   s20 = TAU, s21 = DECAY_RATE, s22 = AMP, s23 = TWO, s24 = ONE,
    //   s25 = DRIVE_GAIN, s26 = SOFT_A, s27 = SOFT_B
    //
    // The constants stay untouched for the whole loop: the phase-wrap test
    // compares against an immediate 0.0 instead of zeroing a register, so
    // SOFT_A is still 1.5 on every sample.
    // Only caller-saved FP registers are used (v0-v7, v16-v31); the low halves
    // of v8-v15 are callee-saved under AAPCS64 and are left untouched.

    // Prologue
    stp     x29, x30, [sp, #-80]!
    mov     x29, sp
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]

    // Save arguments
    mov     x19, x0                     // melody*
    mov     x23, x1                     // L*
    mov     x24, x2                     // R*
    mov     w22, w3                     // n

    // Load melody struct members
    ldr     s0, [x19]                   // phase
    ldr     w20, [x19, #4]              // pos
    ldr     w21, [x19, #8]              // len
    ldr     s1, [x19, #12]              // sr
    ldr     s2, [x19, #16]              // freq

    // Early exit checks (pos/len are unsigned)
    cmp     w20, w21
    b.hs    Ldone
    cbz     w22, Ldone

    // Load constants
    adrp    x4, melody_constants@PAGE
    add     x4, x4, melody_constants@PAGEOFF
    ldr     s20, [x4]                   // TAU
    ldr     s21, [x4, #4]               // DECAY_RATE
    ldr     s22, [x4, #8]               // AMP
    ldr     s23, [x4, #12]              // TWO
    ldr     s24, [x4, #16]              // ONE
    ldr     s25, [x4, #20]              // DRIVE_GAIN
    ldr     s26, [x4, #24]              // SOFT_A
    ldr     s27, [x4, #28]              // SOFT_B

    // phase_inc = TAU * freq / sr
    fmul    s3, s20, s2
    fdiv    s3, s3, s1

    mov     w4, wzr                     // i = 0 (x4 no longer needed as a pointer)

Lloop:
    // Loop bounds check
    cmp     w4, w22
    b.hs    Lend                        // i >= n
    cmp     w20, w21
    b.hs    Lend                        // pos >= len

    // t = pos / sr
    ucvtf   s4, w20
    fdiv    s4, s4, s1

    // env = 1.0 / (1.0 + decay_rate * t)
    fmul    s5, s21, s4
    fadd    s5, s24, s5
    fdiv    s5, s24, s5

    // raw sawtooth = 2 * (phase / TAU) - 1
    fdiv    s6, s0, s20
    fmul    s7, s6, s23
    fsub    s7, s7, s24

    // driven = DRIVE_GAIN * raw
    fmul    s28, s25, s7

    // soft = SOFT_A*driven - SOFT_B*driven^3
    fmul    s29, s28, s28               // driven^2
    fmul    s29, s29, s28               // driven^3
    fmul    s18, s26, s28               // SOFT_A * driven
    fmul    s19, s27, s29               // SOFT_B * driven^3
    fsub    s18, s18, s19

    // sample = soft * env * amp
    fmul    s18, s18, s5
    fmul    s18, s18, s22

    // L[i] += sample ; R[i] += sample
    ldr     s19, [x23, w4, uxtw #2]
    fadd    s19, s19, s18
    str     s19, [x23, w4, uxtw #2]

    ldr     s19, [x24, w4, uxtw #2]
    fadd    s19, s19, s18
    str     s19, [x24, w4, uxtw #2]

    // Update counters and phase
    add     w4, w4, #1                  // i++
    add     w20, w20, #1                // pos++
    fadd    s0, s0, s3                  // phase += phase_inc

    // Wrap phase into [0, TAU) by at most one period in either direction
    fcmp    s0, s20
    b.lt    Lcheck_negative_wrap
    fsub    s0, s0, s20                 // phase >= TAU
    b       Lloop

Lcheck_negative_wrap:
    fcmp    s0, #0.0
    b.ge    Lloop
    fadd    s0, s0, s20                 // phase < 0
    b       Lloop

Lend:
    // Store back state
    str     s0, [x19]                   // phase
    str     w20, [x19, #4]              // pos

Ldone:
    // Epilogue
    ldp     x23, x24, [sp, #48]
    ldp     x21, x22, [sp, #32]
    ldp     x19, x20, [sp, #16]
    ldp     x29, x30, [sp], #80
    ret
||ENDFILE||
at txn 0x5ad3952994bcbb53d81958c303cd1176ae43fb2ebd3d24f6694329c0fcd8f140 Aug-20-2025 12:42:35 AM UTC (28 days ago)

||FILE:asm/active/generator.s||
.text
.align 2
.globl _generator_mix_buffers_asm

// Assembly stubs – we override only generator_process; keep C generator_init
.globl _generator_process
// (no _generator_init symbol here; C version remains)

// ---------------------------------------------------------------------------
// void generator_process(generator_t *g, float *L, float *R, uint32_t num_frames)
//   x0 = g, x1 = L, x2 = R, w3 = num_frames
//
// Renders num_frames of audio into L/R in slices that never cross a musical
// step boundary:
//   1. malloc one scratch block holding four float buffers (Ld, Rd, Ls, Rs)
//      and zero it.
//   2. For each slice: call generator_trigger_step(g) when pos_in_step == 0,
//      run kick/snare into Ld/Rd and melody/mid_fm/bass_fm into Ls/Rs, mix
//      drums + synths into L/R, then advance pos_in_step / step.
//   3. free the scratch block.
// Returns without touching L/R when num_frames == 0 or malloc fails.
//
// Callee-saved register roles (the only state kept across calls):
//   x19 = L base      x20 = R base       x24 = g
//   w21 = frames_rem  w22 = frames_to_process (this slice)
//   w23 = frames_done (also the sample offset of the current slice)
//   x25 = Ld  x26 = Rd  x27 = Ls  x28 = Rs   (scratch bases)
// pos_in_step and step_samples are re-read from *g after every call because
// they would otherwise sit in caller-saved registers that callees may
// overwrite. x18 is not used (reserved on Apple platforms).
//
// Fixed 128-byte frame: [0] x29/x30, [16..95] x19-x28, [96] x21/x22 spill.
// ---------------------------------------------------------------------------
.equ GPROC_STEP_SAMPLES, 12         // g->step_samples (uint32)
.equ GPROC_KICK,         56         // &g->kick
.equ GPROC_SNARE,        96         // &g->snare
.equ GPROC_MELODY,       160        // &g->melody
.equ GPROC_MID_FM,       180        // &g->mid_fm
.equ GPROC_BASS_FM,      220        // &g->bass_fm
.equ GPROC_EVENT_HI,     0x1000     // event/state block = g + 0x1000 + 0x128 (4392)
.equ GPROC_EVENT_LO,     0x128
.equ GPROC_EV_IDX,       0          //   event_idx
.equ GPROC_EV_STEP,      4          //   step
.equ GPROC_EV_POS,       8          //   pos_in_step
.equ GPROC_TOTAL_STEPS,  32         // step wraps to 0 at this value

_generator_process:
    // Prologue - save frame pointer & callee-saved regs x19-x28
    stp     x29, x30, [sp, #-128]!
    mov     x29, sp
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]
    stp     x25, x26, [sp, #64]
    stp     x27, x28, [sp, #80]

    // Early exit if no frames
    cbz     w3, .Lgp_epilogue

    mov     x19, x1                     // L
    mov     x20, x2                     // R
    mov     x24, x0                     // g
    // num_frames is a 32-bit argument: bits 63:32 of x3 are unspecified, so
    // copy through the w view to zero-extend before any 64-bit arithmetic.
    mov     w21, w3                     // frames_rem = num_frames

    // Scratch block: 4 buffers * num_frames * 4 bytes = num_frames * 16
    lsl     x0, x21, #4
    bl      _malloc
    cbz     x0, .Lgp_epilogue           // allocation failed: nothing to free
    mov     x25, x0                     // Ld
    add     x26, x25, x21, lsl #2       // Rd = Ld + num_frames*4
    add     x27, x26, x21, lsl #2       // Ls
    add     x28, x27, x21, lsl #2       // Rs

    // Zero all four scratch buffers (voices accumulate with +=)
    mov     x0, x25
    mov     x1, x26
    mov     x2, x27
    mov     x3, x28
    mov     w4, w21
    bl      _generator_clear_buffers_asm

    mov     w23, wzr                    // frames_done = 0

.Lgp_loop:
    cbz     w21, .Lgp_after_loop        // frames_rem == 0 ? done

    // ----- Trigger events at step start -----
    add     x10, x24, #GPROC_EVENT_HI
    add     x10, x10, #GPROC_EVENT_LO   // x10 = event/state block
    ldr     w8, [x10, #GPROC_EV_POS]    // w8 = pos_in_step
    cbnz    w8, .Lgp_trigger_skip       // only trigger when pos_in_step == 0
    mov     x0, x24
    bl      _generator_trigger_step
    mov     w8, wzr                     // this path is only taken with pos_in_step == 0

.Lgp_trigger_skip:
    ldr     w9, [x24, #GPROC_STEP_SAMPLES]

    // frames_to_step_boundary = step_samples - pos_in_step
    sub     w10, w9, w8
    // FM sustain fix: at the start of a step, stop one frame short of the
    // boundary (when more than one frame remains) so notes span at least
    // two slices.
    cbnz    w8, 1f
    cmp     w10, #1
    b.le    1f
    sub     w10, w10, #1
1:
    // frames_to_process = min(frames_rem, frames_to_step_boundary)
    cmp     w21, w10
    csel    w22, w21, w10, lt

    // ----- Voice processing -----
    // Every argument is rebuilt from callee-saved registers before each call;
    // slice pointers are base + frames_done * 4.
    stp     x21, x22, [sp, #96]         // spill frames_rem / frames_to_process

    add     x0, x24, #GPROC_KICK        // kick -> drum buffers
    add     x1, x25, x23, lsl #2        // Ld
    add     x2, x26, x23, lsl #2        // Rd
    mov     w3, w22
    bl      _kick_process

    add     x0, x24, #GPROC_SNARE       // snare -> drum buffers
    add     x1, x25, x23, lsl #2        // Ld
    add     x2, x26, x23, lsl #2        // Rd
    mov     w3, w22
    bl      _snare_process

    add     x0, x24, #GPROC_MELODY      // melody -> synth buffers
    add     x1, x27, x23, lsl #2        // Ls
    add     x2, x28, x23, lsl #2        // Rs
    mov     w3, w22
    bl      _melody_process

    add     x0, x24, #GPROC_MID_FM      // mid FM voice -> synth buffers
    add     x1, x27, x23, lsl #2        // Ls
    add     x2, x28, x23, lsl #2        // Rs
    mov     w3, w22
    bl      _fm_voice_process

    add     x0, x24, #GPROC_BASS_FM     // bass FM voice -> synth buffers
    add     x1, x27, x23, lsl #2        // Ls
    add     x2, x28, x23, lsl #2        // Rs
    mov     w3, w22
    bl      _fm_voice_process

    ldp     x21, x22, [sp, #96]         // reload frames_rem / frames_to_process

    // ----- Mix drums + synths into the output slice -----
    add     x0, x19, x23, lsl #2        // L out
    add     x1, x20, x23, lsl #2        // R out
    add     x2, x25, x23, lsl #2        // Ld
    add     x3, x26, x23, lsl #2        // Rd
    add     x4, x27, x23, lsl #2        // Ls
    add     x5, x28, x23, lsl #2        // Rs
    mov     w6, w22
    bl      _generator_mix_buffers_asm

    // ----- Advance counters -----
    // Re-read pos_in_step / step_samples from *g: the calls above are free to
    // overwrite x8-x17.
    add     x10, x24, #GPROC_EVENT_HI
    add     x10, x10, #GPROC_EVENT_LO
    ldr     w8, [x10, #GPROC_EV_POS]
    ldr     w9, [x24, #GPROC_STEP_SAMPLES]
    add     w8, w8, w22                 // pos_in_step += frames_to_process
    str     w8, [x10, #GPROC_EV_POS]
    sub     w21, w21, w22               // frames_rem  -= frames_to_process
    add     w23, w23, w22               // frames_done += frames_to_process

    // Step boundary reached?
    cmp     w8, w9
    b.lt    .Lgp_loop

    // Boundary reached - reset pos_in_step and advance step (wrap at TOTAL_STEPS)
    str     wzr, [x10, #GPROC_EV_POS]
    ldr     w12, [x10, #GPROC_EV_STEP]
    add     w12, w12, #1
    cmp     w12, #GPROC_TOTAL_STEPS
    b.lt    3f
    mov     w12, wzr
    str     wzr, [x10, #GPROC_EV_IDX]   // event_idx reset on wrap
3:
    str     w12, [x10, #GPROC_EV_STEP]
    b       .Lgp_loop

.Lgp_after_loop:
    // pos_in_step has already been written back inside the loop.

    // Deallocate scratch
    mov     x0, x25
    bl      _free

    // TEMP: Skip delay & limiter to test if they're clearing audio
    b       .Lgp_epilogue

    // ---------------------------------------------------------------------
    // Slice-5: Apply Delay & Limiter (currently bypassed by the branch above)
    // --------------------------------------------------
    // delay_process_block(&g->delay, L, R, num_frames, 0.45f);
    // limiter_process(&g->limiter, L, R, num_frames);

    // x24 = g, x19 = L buffer, x20 = R buffer, w23 = total frames rendered

    // x0 = &g->delay (offset 4408 bytes)
    add     x0, x24, #4096              // base offset
    add     x0, x0, #312                // 4096 + 312 = 4408
    mov     x1, x19                     // L
    mov     x2, x20                     // R
    mov     w3, w23                     // n = num_frames
    // s0 = 0.45f (IEEE-754 0x3EE66666)
    mov     w4, #0x6666
    movk    w4, #0x3EE6, lsl #16
    fmov    s0, w4
    bl      _delay_process_block

#ifndef SKIP_LIMITER
    // x0 = &g->limiter (offset 4424 bytes)
    add     x0, x24, #4096              // base offset
    add     x0, x0, #328                // 4096 + 328 = 4424
    mov     x1, x19                     // L
    mov     x2, x20                     // R
    mov     w3, w23                     // n = num_frames
    bl      _limiter_process
#endif

.Lgp_epilogue:
    // Common exit: restore callee-saved registers and pop the frame.
    ldp     x27, x28, [x29, #80]
    ldp     x25, x26, [x29, #64]
    ldp     x23, x24, [x29, #48]
    ldp     x21, x22, [x29, #32]
    ldp     x19, x20, [x29, #16]
    ldp     x29, x30, [sp], #128        // pop full 128-byte frame
    ret

/*
 * generator_mix_buffers_asm - sum drum and synth scratch buffers into the output
 * ------------------------------------------------------------------------------
 * void generator_mix_buffers_asm(float *L, float *R,
 *                                const float *Ld, const float *Rd,
 *                                const float *Ls, const float *Rs,
 *                                uint32_t num_frames);
 *
 *   L[i] = Ld[i] + Ls[i]
 *   R[i] = Rd[i] + Rs[i]        for i in [0, num_frames)
 *
 * In:       x0=L, x1=R, x2=Ld, x3=Rd, x4=Ls, x5=Rs, w6=num_frames
 * Clobbers: x0-x5 (advanced past the data), w7, w8, v0-v3, flags
 *
 * Groups of four frames go through NEON; the 0-3 leftover frames are handled
 * one float at a time. Returns immediately when num_frames == 0.
 */

_generator_mix_buffers_asm:
    cbz     w6, .Lmix_done              // nothing to mix

    lsr     w7, w6, #2                  // w7 = number of 4-frame groups
    and     w8, w6, #3                  // w8 = leftover frames (0-3)
    cbz     w7, .Lmix_scalar

.Lmix_quad:
    ld1     {v0.4s}, [x2], #16          // Ld[i..i+3]
    ld1     {v2.4s}, [x4], #16          // Ls[i..i+3]
    ld1     {v1.4s}, [x3], #16          // Rd[i..i+3]
    ld1     {v3.4s}, [x5], #16          // Rs[i..i+3]

    fadd    v0.4s, v0.4s, v2.4s         // left  = Ld + Ls
    fadd    v1.4s, v1.4s, v3.4s         // right = Rd + Rs

    st1     {v0.4s}, [x0], #16
    st1     {v1.4s}, [x1], #16

    sub     w7, w7, #1
    cbnz    w7, .Lmix_quad

.Lmix_scalar:
    cbz     w8, .Lmix_done

.Lmix_single:
    ldr     s0, [x2], #4                // Ld[i]
    ldr     s2, [x4], #4                // Ls[i]
    ldr     s1, [x3], #4                // Rd[i]
    ldr     s3, [x5], #4                // Rs[i]

    fadd    s0, s0, s2
    fadd    s1, s1, s3

    str     s0, [x0], #4                // L[i]
    str     s1, [x1], #4                // R[i]

    sub     w8, w8, #1
    cbnz    w8, .Lmix_single

.Lmix_done:
    ret

.Lmix_null_error:
    // Diagnostic exit: write "MIX_NULL_POINTER_ERROR\n" to stderr with a raw
    // write syscall, then return to the caller with x0/x1 restored.
    // NOTE(review): no branch to this label is visible in this file section,
    // so it appears to be unused at present.
    //
    // write(fd, buf, nbyte) takes fd in x0, buf in x1, nbyte in x2; the
    // syscall number goes in x16. Clobbers x2 and x16.
    stp     x0, x1, [sp, #-16]!
    mov     x0, #2                      // fd = stderr
    adrp    x1, .Lmix_null_msg@PAGE
    add     x1, x1, .Lmix_null_msg@PAGEOFF
    mov     x2, #23                     // 22 characters + trailing '\n'
    mov     x16, #4                     // sys_write
    svc     #0x80
    ldp     x0, x1, [sp], #16
    ret

/*
 * generator_compute_rms_asm - stereo RMS level of a block
 * -------------------------------------------------------
 * float generator_compute_rms_asm(const float *L, const float *R, uint32_t num_frames);
 *
 *   RMS = sqrt( sum(L[i]^2 + R[i]^2) / (num_frames * 2) )
 *
 * In:       x0=L, x1=R, w2=num_frames
 * Out:      s0 = RMS, or 0.0 when num_frames == 0
 * Clobbers: x0, x1 (advanced past the data), w3, w4, v0, v1, v16-v18, flags
 *
 * Groups of four frames are squared and accumulated lane-wise in v16; the
 * 0-3 leftover frames accumulate separately in s17. The two partial sums
 * are combined only at the end.
 */

.globl _generator_compute_rms_asm

_generator_compute_rms_asm:
    cbz     w2, .Lrms_zero              // empty block -> 0.0

    movi    v16.4s, #0                  // per-lane running sum
    fmov    s17, wzr                    // running sum of leftover frames

    lsr     w3, w2, #2                  // w3 = number of 4-frame groups
    and     w4, w2, #3                  // w4 = leftover frames (0-3)
    cbz     w3, .Lrms_scalar

.Lrms_quad:
    ld1     {v0.4s}, [x0], #16          // L[i..i+3]
    ld1     {v1.4s}, [x1], #16          // R[i..i+3]
    fmul    v0.4s, v0.4s, v0.4s         // L^2
    fmul    v1.4s, v1.4s, v1.4s         // R^2
    fadd    v0.4s, v0.4s, v1.4s         // L^2 + R^2
    fadd    v16.4s, v16.4s, v0.4s
    sub     w3, w3, #1
    cbnz    w3, .Lrms_quad

.Lrms_scalar:
    cbz     w4, .Lrms_finalize

.Lrms_single:
    ldr     s0, [x0], #4                // L[i]
    ldr     s1, [x1], #4                // R[i]
    fmul    s0, s0, s0
    fmul    s1, s1, s1
    fadd    s0, s0, s1                  // L^2 + R^2
    fadd    s17, s17, s0
    sub     w4, w4, #1
    cbnz    w4, .Lrms_single

.Lrms_finalize:
    // Reduce the four lanes as (a+b) + (c+d), then add the leftover sum.
    faddp   v18.4s, v16.4s, v16.4s
    faddp   s0, v18.2s
    fadd    s0, s0, s17

    // mean = total / (2 * num_frames); doubling by self-addition is exact.
    ucvtf   s1, w2
    fadd    s1, s1, s1
    fdiv    s0, s0, s1

    fsqrt   s0, s0
    ret

.Lrms_zero:
    fmov    s0, wzr
    ret

/*
 * generator_clear_buffers_asm - zero the four scratch buffers
 * -----------------------------------------------------------
 * void generator_clear_buffers_asm(float *Ld, float *Rd, float *Ls, float *Rs, uint32_t num_frames);
 *
 * Writes 0.0f to the first num_frames floats of each of the four buffers.
 *
 * In:       x0=Ld, x1=Rd, x2=Ls, x3=Rs, w4=num_frames
 * Clobbers: x0-x3 (advanced past the data), w5, w6, v0, flags
 *
 * Four frames per buffer are stored per NEON iteration; the 0-3 leftover
 * frames are stored as single 32-bit zero words (the bit pattern of 0.0f).
 */

.globl _generator_clear_buffers_asm

_generator_clear_buffers_asm:
    cbz     w4, .Lclear_done            // nothing to clear

    movi    v0.4s, #0                   // four 0.0f lanes

    lsr     w5, w4, #2                  // w5 = number of 4-frame groups
    and     w6, w4, #3                  // w6 = leftover frames (0-3)
    cbz     w5, .Lclear_scalar

.Lclear_quad:
    st1     {v0.4s}, [x0], #16          // Ld
    st1     {v0.4s}, [x1], #16          // Rd
    st1     {v0.4s}, [x2], #16          // Ls
    st1     {v0.4s}, [x3], #16          // Rs
    sub     w5, w5, #1
    cbnz    w5, .Lclear_quad

.Lclear_scalar:
    cbz     w6, .Lclear_done

.Lclear_single:
    str     wzr, [x0], #4               // Ld[i] = 0.0f
    str     wzr, [x1], #4               // Rd[i] = 0.0f
    str     wzr, [x2], #4               // Ls[i] = 0.0f
    str     wzr, [x3], #4               // Rs[i] = 0.0f
    sub     w6, w6, #1
    cbnz    w6, .Lclear_single

.Lclear_done:
    ret

/*
* generator_rotate_pattern_asm - NEON vectorized pattern rotation
* -------------------------------------------------------------
* void generator_rotate_pattern_asm(uint8_t *pattern, uint8_t *tmp, uint32_t size, uint32_t rot);
*
* Rotates a uint8_t array by 'rot' positions: pattern[i] = old_pattern[(i+rot) % size]
* Optimized for size=16 (STEPS_PER_BAR) using NEON EXT instruction.
*
* For size=16: Single NEON register holds entire pattern, EXT performs rotation in one operation.
*/

.globl _generator_rotate_pattern_asm

_generator_rotate_pattern_asm:
// Arguments: x0=pattern, x1=tmp, w2=size, w3=rot

// Early exit if no rotation needed
cbz w3, .Lrot_done

// Optimize for STEPS_PER_BAR = 16 case
cmp w2, #16
b.eq .Lrot_neon16

// Fallback for other sizes: scalar implementation
b .Lrot_scalar

.Lrot_neon16:
// NEON optimization for 16-byte patterns (STEPS_PER_BAR)
// Load entire 16-byte pattern into single NEON register
ld1 {v0.16b}, [x0]

// Build rotation index table: [rot, rot+1, rot+2, ..., rot+15] % 16
and w3, w3, #15 // Ensure rot is 0-15 (rot % 16)

// Build index table on stack
sub sp, sp, #16 // Allocate 16 bytes on stack
mov w4, wzr // w4 = loop counter

.Lrot_build_indices:
add w5, w4, w3 // w5 = i + rot
and w5, w5, #15 // w5 = (i + rot) % 16
strb w5, [sp, w4, uxtw] // Store index to stack
add w4, w4, #1 // i++
cmp w4, #16
b.lt .Lrot_build_indices

// Load index table and use TBL for rotation
ld1 {v2.16b}, [sp] // Load index table
tbl v1.16b, {v0.16b}, v2.16b // Perform table lookup rotation

// Store rotated pattern back and clean up stack
st1 {v1.16b}, [x0]
add sp, sp, #16 /
at txn 0x365a9d8a78918ff1c5633072be239db1053a2f5cd68a68ea419cabebdecfc5ac Aug-20-2025 12:42:23 AM UTC (28 days ago)
||NOTDEAFBEEF_MASTER_START||
NotDeafBeef On-Chain Assembly Audio-Visual Generator
=====================================================

This bundle contains the complete source code for generating
audio-visual NFTs from ARM64 assembly. The code is deterministic:
same seed = identical output.

RECONSTRUCTION INSTRUCTIONS:
1. Download all chunks from blockchain transactions
2. Save each as chunk_00.txt, chunk_01.txt, ..., chunk_XX.txt
3. Extract Python script: grep -A 90 "||FILE:extract.py||" chunk_26.txt | sed '1d' | sed '/||ENDFILE||/,$d' > extract.py
4. Run: python3 extract.py
5. Edit seed.s with your token's 32-byte seed
6. Run: ./build.sh
7. Output: nft_audio.wav + nft_final.mp4

TOTAL FILES: 128

||FILE:Makefile||
# NotDeafbeef - Root Build System
# Orchestrates builds for both C and Assembly implementations

# Visual assembly objects shared by vis-build and generate_frames.
# Each <name>.o is assembled from asm/visual/<name>.s.
VISUAL_OBJS := visual_core.o drawing.o ascii_renderer.o particles.o \
               bass_hits.o terrain.o glitch_system.o

# Default target builds the stable configuration
all: c-build

# Build C implementation (stable)
c-build:
	$(MAKE) -C src/c

# Export timeline JSON for a given seed (usage: make export_timeline SEED=0xDEADBEEF OUT=path.json)
export_timeline:
	$(MAKE) -C src/c bin/export_timeline
	cd src/c && ./bin/export_timeline $(SEED) $(OUT)

# Visual assembly object files (static pattern rule: one recipe for all objects)
$(VISUAL_OBJS): %.o: asm/visual/%.s
	gcc -c $< -o $@

# Build visual system with ASM components
vis-build: $(VISUAL_OBJS)
	mkdir -p bin
	gcc -o bin/vis_main src/vis_main.c src/visual_c_stubs.c src/audio_visual_bridge.c src/wav_reader.c $(VISUAL_OBJS) -Iinclude $(shell pkg-config --cflags --libs sdl2) -lm

# Frame generator (no SDL2 required)
generate_frames: $(VISUAL_OBJS)
	gcc -o generate_frames generate_frames.c src/audio_visual_bridge.c src/deterministic_prng.c src/timeline_reader.c simple_wav_reader.c $(VISUAL_OBJS) -Iinclude -Isrc/include -lm

# Build audio system only (for protection verification)
audio:
	$(MAKE) -C src/c segment USE_ASM=1 VOICE_ASM="GENERATOR_ASM KICK_ASM SNARE_ASM HAT_ASM MELODY_ASM LIMITER_ASM"

# Generate test audio files
test-audio:
	python3 tools/generate_test_wavs.py

# NEW: Generate comprehensive WAV tests for all sounds in both C and ASM
test-comprehensive:
	python3 tools/generate_comprehensive_tests.py

# NEW: Compare C vs ASM WAV files
compare:
	python3 tools/compare_c_vs_asm.py

# NEW: Play specific sound for audition (usage: make play SOUND=kick)
play:
ifndef SOUND
	@echo "Usage: make play SOUND=<sound_name>"
	@echo "Example: make play SOUND=kick"
else
	python3 tools/compare_c_vs_asm.py --play $(SOUND)
endif

# Run test suite
test:
	pytest tests/

# Clean all build artifacts
# *.dSYM bundles are directories, so they are removed with rm -rf (find -delete
# fails on non-empty directories); -prune stops find descending into them.
clean:
	$(MAKE) -C src/c clean
	rm -rf output/
	find . -name "*.o" -delete
	find . -name "*.dSYM" -prune -exec rm -rf {} +
	rm -f generate_frames bin/vis_main 2>/dev/null || true

# Generate a demo audio segment
demo:
	$(MAKE) -C src/c segment
	@echo "Generated demo audio: src/c/seed_0xcafebabe.wav"

# Quick verification that everything works
verify: c-build test-audio
	@echo "✅ NotDeafbeef verification complete!"

# NEW: Full verification including comprehensive tests
verify-full: c-build test-comprehensive compare
	@echo "✅ NotDeafbeef full verification complete!"
	@echo "Check the comparison output above for any issues."

# generate_frames is a real output file and therefore intentionally not phony.
.PHONY: all c-build export_timeline vis-build audio test-audio test-comprehensive compare play test clean demo verify verify-full
||ENDFILE||
||FILE:asm/active/delay.s||
.text
.align 2
.globl _delay_process_block

// -----------------------------------------------------------------------------
// void delay_process_block(delay_t *d, float *L, float *R, uint32_t n, float feedback)
// x0 = delay_t* { float *buf; uint32_t size; uint32_t idx; }
// x1 = L buffer
// x2 = R buffer
// w3 = n samples
// s0 = feedback amount
// Stereo ping-pong delay: L feeds R, R feeds L
//
// Per sample i, with (yl, yr) = the stereo pair stored at buf[idx*2]:
//   buf[idx*2]   = L[i] + yr * feedback
//   buf[idx*2+1] = R[i] + yl * feedback
//   L[i] += yl;  R[i] += yr
//   idx = (idx + 1 >= size) ? 0 : idx + 1
// The final idx is written back to d->idx (offset 12) when n != 0.
//
// Scratch registers used by the body: x4-x9, s1-s6. x1/x2 are advanced by
// 4 bytes per sample.
// -----------------------------------------------------------------------------
_delay_process_block:
// Prologue – allocates a 512-byte frame. x27/x28 are saved at offset #480,
// which lies inside the LDP/STP scaled-immediate range for 64-bit pairs
// (maximum #504). NOTE(review): the original note said this placement avoids
// overlapping caller memory "even in fast execution"; the body below never
// writes x19-x28 or q8-q15, so these saves look purely defensive — confirm
// before removing.
stp x29, x30, [sp, #-512]!
stp q8, q9, [sp, #112]
stp q10, q11, [sp, #144]
stp q12, q13, [sp, #176]
stp q14, q15, [sp, #208]
mov x29, sp
stp x19, x20, [sp, #16]
stp x21, x22, [sp, #32]
stp x23, x24, [sp, #48]
stp x25, x26, [sp, #64]
stp x27, x28, [sp, #480]

// Load struct members (buf,size,idx) into convenient regs
ldr x4, [x0] // buf*
ldr w5, [x0, #8] // size
ldr w6, [x0, #12] // idx

// Early-out if n==0 (skips the idx write-back as well)
cbz w3, Ldone

// --- PRE-WRAP BUG FIX ----------------------------------------------------
// Make absolutely sure idx is in range BEFORE first buffer access.
cmp w6, w5 // idx >= size ? (unsigned)
csel w6, wzr, w6, hs// if so wrap to 0
// ------------------------------------------------------------------------

mov w7, wzr // loop counter i

Lloop:
// Break conditions
cmp w7, w3
b.hs Lstore_idx // i >= n → exit loop

// Calculate &buf[idx*2]
// (each sample is float = 4 bytes, stereo interleaved)
// addr = buf + idx*8
lsl w8, w6, #3 // w8 = idx*8 (32-bit write also zeroes the upper half of x8)
add x9, x4, x8 // x9 = &buf[idx*2]

// Load delayed samples
ldp s1, s2, [x9] // s1 = yl, s2 = yr

// Load input samples (the pointers are advanced later, by the stores below)
ldr s3, [x1] // L[i]
ldr s4, [x2] // R[i]

// buf[idx*2] = L + yr*feedback (fused multiply-add)
fmadd s5, s2, s0, s3
// buf[idx*2+1] = R + yl*feedback (fused multiply-add)
fmadd s6, s1, s0, s4
stp s5, s6, [x9]

// Add delayed signal to dry samples
fadd s3, s3, s1 // L[i] = dryL + yl
fadd s4, s4, s2 // R[i] = dryR + yr
str s3, [x1], #4 // write & advance L*
str s4, [x2], #4 // write & advance R*

// Increment and wrap idx
// NOTE: the csel below is split across an on-chain bundle chunk boundary;
// with the three marker lines removed it reads "csel w6, wzr, w6, hs".
add w6, w6, #1
cmp w6, w5
csel w6[CHUNK 2 OF 2]
BUNDLE_0_CORE - PART 2
Concatenate all chunks in order to reconstruct.

, wzr, w6, hs

// Next sample
add w7, w7, #1
b Lloop

Lstore_idx:
// Store updated idx back to struct
str w6, [x0, #12]

Ldone:
// Epilogue – reloads x27/x28, q8-q15 and x29/x30, then frees the frame.
// x19-x26 are saved in the prologue but not reloaded here; the body never
// writes them, so they still hold the caller's values.
ldp x27, x28, [sp, #480]
ldp q14, q15, [sp, #208]
ldp q12, q13, [sp, #176]
ldp q10, q11, [sp, #144]
ldp q8, q9, [sp, #112]
ldp x29, x30, [sp]
add sp, sp, #512
ret||ENDFILE||
||FILE:asm/active/euclid.s||
.text
.align 2
.globl _euclid_pattern
_euclid_pattern: // void euclid_pattern(int pulses,int steps,uint8_t *out)
    // In:    w0 = pulses, w1 = steps, x2 = out (steps bytes are written)
    // Out:   out[i] = 1 on steps where the running bucket reaches `steps`, else 0
    // Clobb: w3-w6, flags
    stp     x29, x30, [sp, #-16]!       // prologue
    mov     x29, sp

    mov     w3, wzr                     // bucket = 0
    mov     w4, wzr                     // i = 0
    cmp     w1, #0
    b.le    Leuclid_done                // steps <= 0: nothing to write

Leuclid_step:
    // Invariant: 0 <= i < steps (signed compare, as in the C prototype).
    add     w3, w3, w0                  // bucket += pulses
    sub     w6, w3, w1                  // candidate: bucket - steps
    cmp     w3, w1
    cset    w5, ge                      // hit = (bucket >= steps)
    csel    w3, w6, w3, ge              // on a hit, bucket -= steps
    strb    w5, [x2, x4]                // out[i] = hit (upper half of x4 is zero)

    add     w4, w4, #1                  // i++
    cmp     w4, w1
    b.lt    Leuclid_step                // while (i < steps)

Leuclid_done:
    ldp     x29, x30, [sp], #16         // epilogue
    ret ||ENDFILE||
||FILE:asm/active/exp4_ps_asm.s||
.text
.align 2
.globl _exp4_ps_asm

// float32x4_t exp4_ps_asm(float32x4_t x)
// 4-wide single-precision e^x approximation, ported from fast_math_neon.h.
// Identical maths to the C intrinsics version.
//
// In:    v0 = x (four floats)
// Out:   v0 = approximation of e^x per lane
// Clobb: x9, v1-v7, v16-v30
//
// Only caller-saved vector registers are used. v8-v15 are callee-saved under
// AAPCS64 (low 64 bits), so this leaf function must not overwrite them without
// saving; all temporaries therefore live in v1-v7 and v16-v30.
//
// NOTE(review): the polynomial evaluated below is
//   y = c1 + c2*x + c3*x^2 + c4*x^3 + c5*x^4 + x + 1
// Confirm the coefficient/power pairing against fast_math_neon.h. It is kept
// exactly as-is here so results stay bit-identical.

_exp4_ps_asm:
    // x9 = base of the constant table (16 bytes per entry)
    adrp    x9, Lexp_const@PAGE
    add     x9, x9, Lexp_const@PAGEOFF

    // Clamp x to [min_x, max_x]
    ldr     q1, [x9, #0]                // max_x
    ldr     q2, [x9, #16]               // min_x
    fmin    v3.4s, v0.4s, v1.4s         // v3 = min(x, max_x)
    fmax    v0.4s, v3.4s, v2.4s         // x  = max(v3, min_x)

    // fx = x * log2e + 0.5
    ldr     q4, [x9, #32]               // log2e
    fmul    v5.4s, v0.4s, v4.4s
    ldr     q6, [x9, #48]               // 0.5
    fadd    v5.4s, v5.4s, v6.4s

    // n = (int)fx, truncating toward zero; fx = (float)n
    fcvtzs  v7.4s, v5.4s                // v7 = n (emm0)
    scvtf   v16.4s, v7.4s               // v16 = fx

    // x -= fx * ln2_hi; x -= fx * ln2_lo   (two-step range reduction)
    ldr     q17, [x9, #64]              // ln2_hi
    ldr     q18, [x9, #80]              // ln2_lo
    fmul    v19.4s, v16.4s, v17.4s
    fmul    v20.4s, v16.4s, v18.4s
    fsub    v0.4s, v0.4s, v19.4s
    fsub    v0.4s, v0.4s, v20.4s

    // Polynomial approximation, accumulated in v22 (y)
    fmul    v21.4s, v0.4s, v0.4s        // x2 = x*x

    ldr     q22, [x9, #96]              // y  = c1
    ldr     q23, [x9, #112]             // c2
    fmla    v22.4s, v0.4s, v23.4s       // y += x * c2

    ldr     q24, [x9, #128]             // c3
    fmla    v22.4s, v21.4s, v24.4s      // y += x2 * c3

    fmul    v25.4s, v21.4s, v0.4s       // x3 = x2*x
    ldr     q26, [x9, #144]             // c4
    fmla    v22.4s, v25.4s, v26.4s      // y += x3 * c4

    fmul    v27.4s, v21.4s, v21.4s      // x4 = x2*x2
    ldr     q28, [x9, #160]             // c5
    fmla    v22.4s, v27.4s, v28.4s      // y += x4 * c5

    // y += x
    fadd    v22.4s, v22.4s, v0.4s

    // y += 1.0f
    ldr     q29, [x9, #176]             // 1.0f vector
    fadd    v22.4s, v22.4s, v29.4s

    // Construct 2^n: place (n + 127) in the float exponent field
    movi    v30.4s, #127                // exponent bias
    add     v7.4s, v7.4s, v30.4s
    shl     v7.4s, v7.4s, #23

    // result = y * 2^n
    fmul    v0.4s, v22.4s, v7.4s
    ret

.align 4
Lexp_const:
// max_x
.float 88.3762626647949, 88.3762626647949, 88.3762626647949, 88.3762626647949
// min_x
.float -88.3762626647949, -88.3762626647949, -88.3762626647949, -88.3762626647949
// log2e
.float 1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341
// 0.5
.float 0.5, 0.5, 0.5, 0.5
// ln2_hi
.float 0.693359375, 0.693359375, 0.693359375, 0.693359375
// ln2_lo
.float -2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4
// c1
.float 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4
// c2
.float 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3
// c3
.float 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3
// c4
.float 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2
// c5
.float 0.16666665459, 0.16666665459, 0.16666665459, 0.16666665459
// 1.0
.float 1.0, 1.0, 1.0, 1.0 ||ENDFILE||
||FILE:asm/active/fm_voice.s||
.section __TEXT,__const
.align 2
.L_tau:
.float 6.283185307
.L_pi:
.float 3.14159265
.L_six:
.float 6.0
.L_onehundredtwenty:
.float 120.0

.text
.align 2
.globl _fm_voice_process

// fm_voice_process(fm_voice_t *v, float32_t *L, float32_t *R, uint32_t n)
// Struct offsets: sr=0, carrier_freq=4, ratio=8, index0=12, amp=16, decay=20
// len=24, pos=28, carrier_phase=32, mod_phase=36
//
// Adds up to n samples of a two-operator FM voice to L[] and R[] (same sample
// to both), stopping early once v->pos reaches v->len. Updates v->pos,
// v->carrier_phase and v->mod_phase.
//
// In:    x0 = v, x1 = L, x2 = R, w3 = n
// Clobb: w4-w7, s0-s7, s16-s31, flags
//
// Register roles inside the loop:
//   w4 = pos, w5 = len, w6 = i
//   s5 = 6.0, s6 = 120.0, s7 = TAU, s30 = PI          (loop-invariant constants)
//   s16 = sr, s19 = index0, s20 = amp, s21 = decay
//   s22 = carrier_phase, s23 = mod_phase, s24 = c_inc, s25 = m_inc
//   s27 = env, s29 = index, s31 = working sample, s0-s4 = scratch
_fm_voice_process:
    // x0 = fm_voice_t *v, x1 = L, x2 = R, x3 = n

    // Early exit if pos >= len
    // NOTE(review): pos/len are compared as signed here although pos is later
    // converted with ucvtf (unsigned); confirm the field types in fm_voice_t.
    ldr     w4, [x0, #28]               // v->pos
    ldr     w5, [x0, #24]               // v->len
    cmp     w4, w5
    b.ge    .fm_exit

    // Load parameters
    ldr     s16, [x0, #0]               // v->sr
    ldr     s17, [x0, #4]               // v->carrier_freq
    ldr     s18, [x0, #8]               // v->ratio
    ldr     s19, [x0, #12]              // v->index0
    ldr     s20, [x0, #16]              // v->amp
    ldr     s21, [x0, #20]              // v->decay
    ldr     s22, [x0, #32]              // v->carrier_phase
    ldr     s23, [x0, #36]              // v->mod_phase

    // Load the loop-invariant constants once instead of once per sample.
    adrp    x7, .L_tau@PAGE
    add     x7, x7, .L_tau@PAGEOFF
    ldr     s7, [x7]                    // s7 = TAU
    adrp    x7, .L_pi@PAGE
    add     x7, x7, .L_pi@PAGEOFF
    ldr     s30, [x7]                   // s30 = PI
    adrp    x7, .L_six@PAGE
    add     x7, x7, .L_six@PAGEOFF
    ldr     s5, [x7]                    // s5 = 6.0
    adrp    x7, .L_onehundredtwenty@PAGE
    add     x7, x7, .L_onehundredtwenty@PAGEOFF
    ldr     s6, [x7]                    // s6 = 120.0

    // Calculate increments: c_inc = TAU * carrier_freq / sr
    fmul    s24, s7, s17                // TAU * carrier_freq
    fdiv    s24, s24, s16               // c_inc = TAU * carrier_freq / sr

    // m_inc = TAU * carrier_freq * ratio / sr
    fmul    s25, s24, s18               // m_inc = c_inc * ratio

    mov     w6, #0                      // i = 0

.fm_loop:
    cmp     w6, w3                      // i < n?  (n is uint32_t: unsigned compare)
    b.hs    .fm_loop_end

    // Stop when pos >= len. w4/w5 stay in registers: pos is only changed by
    // this loop, and len is never written here.
    cmp     w4, w5
    b.ge    .fm_loop_end

    // Calculate envelope: t = pos / sr
    ucvtf   s26, w4                     // convert pos to float
    fdiv    s26, s26, s16               // t = pos / sr

    // Simple exponential decay approximation: env ≈ 1.0 / (1.0 + decay * t)
    fmul    s27, s21, s26               // decay * t
    fmov    s28, #1.0
    fadd    s27, s28, s27               // 1.0 + decay * t
    fdiv    s27, s28, s27               // env = 1.0 / (1.0 + decay * t)

    // Index with envelope: index = index0 * env
    fmul    s29, s19, s27               // index = index0 * env

    // FM synthesis: sin(carrier_phase + index * sin(mod_phase))
    // Both sines use the polynomial x - x³/6 + x⁵/120.

    // Step 1: sin(mod_phase)
    // Wrap mod_phase toward [-π, π] (a single ±2π correction step)
    fmov    s31, s23                    // s31 = mod_phase
    fcmp    s31, s30                    // compare with π
    b.le    .fm_mod_no_wrap_pos
    fsub    s31, s31, s30               // mod_phase - π
    fsub    s31, s31, s30               // mod_phase - 2π
.fm_mod_no_wrap_pos:
    fneg    s0, s30                     // -π
    fcmp    s31, s0                     // compare with -π
    b.ge    .fm_mod_wrapped
    fadd    s31, s31, s30               // mod_phase + π
    fadd    s31, s31, s30               // mod_phase + 2π
.fm_mod_wrapped:

    // Polynomial sine approximation (used instead of libm)
    fmul    s0, s31, s31                // x²
    fmul    s1, s0, s31                 // x³
    fmul    s2, s0, s0                  // x⁴
    fmul    s3, s2, s31                 // x⁵
    fdiv    s1, s1, s5                  // x³/6
    fdiv    s3, s3, s6                  // x⁵/120
    fsub    s31, s31, s1                // x - x³/6
    fadd    s31, s31, s3                // x - x³/6 + x⁵/120

    // Step 2: Apply modulation index: index * sin(mod_phase)
    fmul    s31, s29, s31               // index * sin(mod_phase)

    // Clamp modulation to prevent instability: limit to [-3.0, 3.0]
    fmov    s0, #3.0
    fcmp    s31, s0
    b.le    .fm_mod_clamp_pos_ok
    fmov    s31, s0                     // clamp to +3.0
.fm_mod_clamp_pos_ok:
    fneg    s0, s0                      // -3.0
    fcmp    s31, s0
    b.ge    .fm_mod_clamp_neg_ok
    fmov    s31, s0                     // clamp to -3.0
.fm_mod_clamp_neg_ok:

    // Step 3: Add to carrier phase: carrier_phase + index * sin(mod_phase)
    fadd    s31, s22, s31               // carrier_phase + index * sin(mod_phase)

    // Step 4: sin(carrier_phase + index * sin(mod_phase))
    // Wrap result toward [-π, π] (a single ±2π correction step)
    fcmp    s31, s30                    // compare with π
    b.le    .fm_carr_no_wrap_pos
    fsub    s31, s31, s30               // result - π
    fsub    s31, s31, s30               // result - 2π
.fm_carr_no_wrap_pos:
    fneg    s0, s30                     // -π
    fcmp    s31, s0                     // compare with -π
    b.ge    .fm_carr_wrapped
    fadd    s31, s31, s30               // result + π
    fadd    s31, s31, s30               // result + 2π
.fm_carr_wrapped:

    // Same polynomial sine approximation as step 1
    fmul    s0, s31, s31                // x²
    fmul    s1, s0, s31                 // x³
    fmul    s2, s0, s0                  // x⁴
    fmul    s3, s2, s31                 // x⁵
    fdiv    s1, s1, s5                  // x³/6
    fdiv    s3, s3, s6                  // x⁵/120
    fsub    s31, s31, s1                // x - x³/6
    fadd    s31, s31, s3                // x - x³/6 + x⁵/120
    // s31 now contains the final FM synthesis result

    // Apply envelope and amplitude (with scaling to prevent clipping)
    fmul    s31, s31, s27               // apply envelope
    fmul    s31, s31, s20               // apply amplitude
    fmov    s0, #0.25                   // Scale down to prevent clipping from FM harmonics
    fmul    s31, s31, s0                // final scaling

    // Final safety clamp to prevent amplitude spikes: limit to [-1.0, 1.0]
    fmov    s0, #1.0
    fcmp    s31, s0
    b.le    .fm_out_clamp_pos_ok
    fmov    s31, s0                     // clamp to +1.0
.fm_out_clamp_pos_ok:
    fneg    s0, s0                      // -1.0
    fcmp    s31, s0
    b.ge    .fm_out_clamp_neg_ok
    fmov    s31, s0                     // clamp to -1.0
.fm_out_clamp_neg_ok:

    // Add to output buffers (x6 upper half is zero: w6 is only written as a W register)
    ldr     s0, [x1, x6, lsl #2]        // L[i]
    fadd    s0, s0, s31                 // L[i] += sample
    str     s0, [x1, x6, lsl #2]        // store L[i]

    ldr     s0, [x2, x6, lsl #2]        // R[i]
    fadd    s0, s0, s31                 // R[i] += sample
    str     s0, [x2, x6, lsl #2]        // store R[i]

    // Update phases
    fadd    s22, s22, s24               // carrier_phase += c_inc
    fadd    s23, s23, s25               // mod_phase += m_inc

    // Wrap phases by subtracting the nearest multiple of TAU:
    //   phase = phase - TAU * round(phase / TAU)
    // which leaves the phase in roughly [-π, π] (not [0, TAU]).
    fdiv    s1, s22, s7                 // s1 = carrier_phase / TAU
    frinta  s1, s1                      // s1 = round(carrier_phase / TAU)
    fmul    s1, s1, s7                  // s1 = round(carrier_phase / TAU) * TAU
    fsub    s22, s22, s1                // carrier_phase -= round_part

    fdiv    s2, s23, s7                 // s2 = mod_phase / TAU
    frinta  s2, s2                      // s2 = round(mod_phase / TAU)
    fmul    s2, s2, s7                  // s2 = round(mod_phase / TAU) * TAU
    fsub    s23, s23, s2                // mod_phase -= round_part

    // Increment counters
    add     w6, w6, #1                  // i++
    add     w4, w4, #1                  // pos++
    str     w4, [x0, #28]               // store v->pos

    b       .fm_loop

.fm_loop_end:
    // Store updated phases
    str     s22, [x0, #32]              // v->carrier_phase
    str     s23, [x0, #36]              // v->mod_phase

.fm_exit:
    ret
||ENDFILE||
at txn 0xc258b892b2f63f0ba38ce4aee3d40c380eb273d3b197675d3927e2888e9182c0 Aug-20-2025 12:40:47 AM UTC (28 days ago)
=:THOR.RUJI:thor1tmpduvq480lu235uns2s589dqzdpafgs5dj2mf:0/1/0:va:0
at txn 0x58b688660bdeb3757518357fa131bf7b56e9741a2860978191199723c250a4c4 Aug-20-2025 12:31:35 AM UTC (28 days ago)
=:THOR.RUJI:thor1tmpduvq480lu235uns2s589dqzdpafgs5dj2mf:0/1/0:va:0
at txn 0x01efbd79b1860ce28fd070b91dd43d16da71d27b8f5cda4c574a0506f7a16d70 Aug-20-2025 12:27:23 AM UTC (28 days ago)
=:b:3FMKaWhgLGkMKyLseuPukoZtFHr3tHarug:0/1/0:zengo:200
at txn 0x03fbfbca259817354e845052ea1e81c702efadb78049afd7b274a8de5217d331 Aug-20-2025 12:26:23 AM UTC (28 days ago)
DC-L5:A71w7GgJrJ47uldVcjooTxUHOIrQ/Z9ygFcKO73zZB0=
at txn 0xe3a13a53d31451f66c9c02ca34a30e8ec7835af66b6df2824edadc9597b7113f Aug-20-2025 12:10:59 AM UTC (28 days ago)
Verification: Requesting verification for 0x4D51573Db98693561B2c97722aEecADA267A9345 with no branding text on etherscan
at txn 0x91c4ef8cee438aa89d6bc8a36739a6d7656cd5344ff1554a7ba49f6609a63d4d Aug-20-2025 12:01:23 AM UTC (28 days ago)

{"BlockHash":"0x6060edb39d98e5ea02583d79074bf873fc9e810354ee8c51693392a68ce3ae96","Sender":"5GCc8penY3wGsmaq8ZgeTW7TgEfN76tZGvAnUZ2ZqmMbPWqj","Nonce":3891,"Commitment":"0x841c4dbb154e556776700ffbb61891cd44d8310ae76494ae4e774788dd2e7122"}
at txn 0xd24b5f4acc875659897e9c07696554fd66b8bc503ee8bde0ad04292f501ec32a Aug-19-2025 11:55:11 PM UTC (28 days ago)
------BEGIN MEMO------eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJub25jZSI6IjllZDAyN2YzLThhZGQtNDY5NC04MTI4LWQzNTNlMTRkMTUyNSIsIm9yZGVySWRlbnRpZmllciI6ImI5ZTQ5NDlkLTNhYzQtNDg3Mi1hYTU2LWNmZGJlN2RhNDBjNyIsImlhdCI6MTc1NTY0NjY5Mn0.utqlnjY-17TKOavHMmIzpH0QQvr2nVcyILFSzk-65o8------END MEMO------
at txn 0x439c86f66c8a5dd4acb692e5e92c4ad118a79688e0b69bc2df53962459168b13 Aug-19-2025 11:38:59 PM UTC (28 days ago)
------BEGIN MEMO------eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJub25jZSI6IjEzZTY0Njc3LTkzNTctNDEwYy05Y2I5LWQ4ODhhYWQ4NTA0MiIsIm9yZGVySWRlbnRpZmllciI6ImJjZDFkZTQyLTk3MzUtNDVkMS04NTNmLTc0ODZjMzAyYzMwMSIsImlhdCI6MTc1NTY0NTM2Nn0.7gFODtJ7z8kmexUgsriU-gwzvG6twhI-UYIl59dS87E------END MEMO------
at txn 0xa3a668847655eaaea03641221f54ae970c959463b77357184f0f2852c5cfdb60 Aug-19-2025 11:16:23 PM UTC (28 days ago)
Show messages: