|
Post by DarioG on Aug 4, 2021 21:28:41 GMT
asm volatile ("mov #_readPointer,w0" : : : "w0","w1","w2","w3","w4","w5","w6","w7","w8","w9","w10","w11","w12","w13"); // per clobbering registers https://www.eevblog.com/forum/microcontrollers/inserting-some-assembly-inside-c-code-for-pic24fj-series/ asm volatile ("mov [w0],w8"); // W8 = p asm volatile ("mov #_win,w10"); // W10 = pw asm volatile ("movsac A,[w8]+=2,w4,[w10]+=2,w5"); // W4 = *p, W5 = *pw asm volatile ("do #127,.myenddo"); asm volatile ("mpy w4*w5,B,[w8]+=2,w4,[w10]+=2,w5"); asm volatile ("mov #_c,w9"); // W9 = &c // USARE cw e shiftare al volo?? asm volatile ("mov #_z1,w11"); // W11 = &z1 asm volatile ("mov #_z2,w12"); // W12 = &z2 asm volatile ("movsac A,[w9]+=2,w6,[w11],w7"); // W6 = c[i], W7 = sw[i] // non va! ah perché w11 è accettato solo in ymemory e lo prende così... asm volatile ("do #2,.myendfor"); //MAX_SLOTS asm volatile ("clr A"); asm volatile ("ADD A"); asm volatile ("mac w6*w7,A,[w9]+=2,w6"); asm volatile ("sac A,#-5,w0"); // W0=z0 asm volatile ("sub w0,[w12],w0"); asm volatile ("mov [w11],[w12++]"); //z2[i] = z1[i]; asm volatile ("mov w0,[w11++]"); //z1[i] = z0; asm volatile ("mov [w11],w7"); asm volatile (".myendfor: nop"); asm volatile ("mov #_opBuffWindow+160*2,w0"); asm volatile ("cp w8,w0"); asm volatile ("BRA NZ, skip1"); asm volatile ("mov #_opBuffWindow,w8"); // W8 = p asm volatile ("movsac A,[w8]+=2,w4"); // W4 = *p asm volatile ("skip1:"); asm volatile (".myenddo: nop"); asm volatile ("mov #_readPointer,w1"); asm volatile ("mov [w1],w0"); asm volatile ("add #32*2,w0"); //SUB_BLOCK_LENGTH asm volatile ("mov #_opBuffWindow+160*2,w2"); asm volatile ("cp w0,w2"); // CP literal accetta max 32... asm volatile ("BRA NZ, skip2"); asm volatile ("mov #_opBuffWindow,w0"); asm volatile ("mov w0,[w1]"); asm volatile ("BRA skip2_"); asm volatile ("skip2:"); asm volatile ("mov w0,[w1]"); asm volatile ("skip2_:"); asm volatile ("mov #_cw,w8"); // W8 = &cw asm volatile ("mov #_sw,w9"); // W9 = &sw asm volatile ("mov #_z1,w10"); // W10 = &z1 asm volatile ("mov #_z2,w11"); // W11 = &z2 asm volatile ("mov #_I,w12"); // W12 = &I asm volatile ("mov #_Q,w13"); // W13 = &Q asm volatile ("mov #_fftData,w7"); // W7 = &fftData asm volatile ("movsac A,[w8]+=2,w4,[w10]+=2,w6"); // W4 = cw[i],W6 = z1[i] asm volatile ("mov [w9++],w5"); // W5 = sw[i] asm volatile ("do #2,.myendfor2"); //MAX_SLOTS asm volatile ("mpy w4*w6,A,[w8]+=2,w4"); asm volatile ("lac [w11++],#5,B"); // z2[i] asm volatile ("SUB A"); // forse usare MSC e LAC.. no, può solo sottrarre da acc... asm volatile ("sac A,#-5,w0"); // W12=I[i] asm volatile ("mpy w5*w6,B,[w9]+=2,w5,[w10]+=2,w6"); asm volatile ("sac B,#-5,w1"); // W13=Q[i] // CORCONbits.US=0b01; // unsigned multiply, SERVE QUA?? e poi risistemare per sopra // DEVO mettere w12 e 13 in 4 e 5 per la mpy... però così poi devo ricaricarli e perdo il postinc di sopra asm volatile ("exch w4,w0"); asm volatile ("exch w5,w1"); asm volatile ("mpy w4*w4,A"); asm volatile ("mac w5*w5,A"); asm volatile ("sac A,#1,[w7++]"); // W1=fftData[i] asm volatile ("exch w4,w0"); asm volatile ("exch w5,w1"); asm volatile ("mov w0,[w12++]"); asm volatile ("mov w1,[w13++]"); asm volatile (".myendfor2: nop");
...further details coming soon
|
|
|
Post by DarioG on Aug 4, 2021 21:42:06 GMT
~80uS on 128 samples (including Hamming windowing) and 3 frequency outputs. @140mhz CPU clock of course.
|
|
|
Post by DarioG on Aug 5, 2021 10:40:49 GMT
Original code comes from ST DT0089 Application Note for detecting DTMF tones. Using standard 32bit ints it was performing in ~200uS in the same conditions as above:
int win[MAXN]; // Window // Goertzel int c[8], cw[8], sw[8]; // Goertzel constants int z1[8], z2[8]; // Goertzel status registers int I[8], Q[8], M2[8]; // Goertzel output: real, imag, squared magnitude
for(i=0; i<N; i++) { // init window (Hamming) win[i] = (int)round(S*(0.54 - 0.46 * cosf(2.0*M_PI*(float)i /( float)(N-1)))); }
//December 2017 DT0089 Rev 1 8/10 //www.st.com for(i=0; i<4; i++) { // init Goertzel constants // CORDIC may be used here to compute sin() and cos() w = 2.0*M_PI*round((float)N * (float)frow[i]/(float)Fs) / (float)N; cw[i] = (int)round(S*cosf(w)); c[i] = cw[i]<<1; sw[i] = (int)round(S*sinf(w)); w = 2.0*M_PI*round((float)N * (float)fcol[i]/(float)Fs) / (float)N; cw[i+4] = (int)round(S*cosf(w)); c[i+4] = cw[i+4] << 1; sw[i+4] = (int)round(S*sinf(w)); }
for(n=0; !feof(f);) { i = fscanf(f,"%d",&x); if(i<1) continue; x = ((x * win[n]) >> b); // windowing if((n % N) == 0) for(i=0; i<8; i++) { z1[i]=0; z2[i]=0; } // Goertzel reset
// **** GOERTZEL ITERATION **** for(i=0; i<8; i++) { z0 = x + ((c[i] * z1[i]) >> b) - z2[i]; // Goertzel iteration z2[i] = z1[i]; z1[i] = z0; } // Goertzel status update
// **** GOERTZEL ITERATION **** n++; if((n % N) == 0) { n=0; // finalize and decode for(i1=i2=-1,i=0; i<8; i++) { // CORDIC may be used here to compute atan2() and sqrt() I[i] = ((cw[i] * z1[i]) >> b) - z2[i]; // Goertzel final I Q[i] = ((sw[i] * z1[i]) >> b); // Goertzel final Q M2[i] = I[i] * I[i] + Q[i] * Q[i]; // magnitude squared if(M2[i]>th) { // DTMF decoding
// omissis } } }
|
|
|
Post by DarioG on Aug 5, 2021 10:48:04 GMT
This is the surrounding C code for my Assembler code:
// variables #define Fs 40000L //40500 // per multiplo esatto 2025... 40000L #define FFT_BLOCK_LENGTH 128 // = Number of samples per iteration #define SUB_BLOCK_LENGTH (FFT_BLOCK_LENGTH/4) // mm con 8 non va... strano...
int16_t __attribute__ ((space(ymemory),far)) win[FFT_BLOCK_LENGTH]; // Window #define Q1_FACTOR 10 // // 6.10 #define SCALING_FACTOR Q1_FACTOR #define MAX_SLOTS 3 // 3 per ricevere o 6 per testare loopback... int16_t cw[MAX_SLOTS],sw[MAX_SLOTS]; // xmemory pare non serva... int16_t c[MAX_SLOTS]; // Goertzel constants int16_t __attribute__ ((space(ymemory),far)) z1[MAX_SLOTS], z2[MAX_SLOTS]; // Goertzel status registers int16_t I[MAX_SLOTS], Q[MAX_SLOTS]; // Goertzel output: real, imag, squared magnitude WORD fftData[MAX_SLOTS];
int16_t opBuffWindow[FFT_BLOCK_LENGTH+SUB_BLOCK_LENGTH];
//initialization #define round(n) (n) S = (float)(1 << Q1_FACTOR); // scaling factor for(i=0; i<FFT_BLOCK_LENGTH; i++) // init window (Hamming) win[i] = (int)round((S-1) *(0.54 - 0.46 * cosf(2.0*PI*(float)i /(float)(FFT_BLOCK_LENGTH-1))));
for(i=0; i<MAX_SLOTS; i++) { // init Goertzel constants // CORDIC may be used here to compute sin() and cos() w = 2.0*PI*round((float)FFT_BLOCK_LENGTH * (float)440/(float)Fs) / (float)FFT_BLOCK_LENGTH; // all'init, fisso su 440 tanto per :) cw[i] = (int)round((S-1)*cosf(w)); c[i] = cw[i] << 1; sw[i] = (int)round((S-1)*sinf(w)); }
WORD freq[3]; // int i; float w, S;
freq[1]=2025; // software modem frequencies freq[0]=2225; freq[2]=2125; S = (float)(1 << Q1_FACTOR); // scaling factor
for(i=0; i<MAX_SLOTS; i++) { // init Goertzel constants // CORDIC may be used here to compute sin() and cos() w = 2.0*PI*round((float)FFT_BLOCK_LENGTH * (float)freq[i]/(float)Fs) / (float)FFT_BLOCK_LENGTH; cw[i] = (int)round((S-1)*cosf(w)); c[i] = cw[i] << 1; sw[i] = (int)round((S-1)*sinf(w)); }
readPointer=opBuffWindow; // circular buffer read writePointer=opBuffWindow+FFT_BLOCK_LENGTH; // circular buffer write
//main code to be executed on 128 samples (circular buffer filled via IRQ) which means 800uSec as of now for(i=0; i<MAX_SLOTS; i++) z1[i]=z2[i]=0; // Goertzel reset asm volatile ("mov #_readPointer,w0" : : : "w0","w1","w2","w3","w4","w5","w6","w7","w8","w9","w10","w11","w12","w13"); // per clobbering registers https://www.eevblog.com/forum/microcontrollers/inserting-some-assembly-inside-c-code-for-pic24fj-series/ asm volatile ("mov [w0],w8"); // W8 = p asm volatile ("mov #_win,w10"); // W10 = pw asm volatile ("movsac A,[w8]+=2,w4,[w10]+=2,w5"); // W4 = *p, W5 = *pw asm volatile ("do #127,.myenddo"); asm volatile ("mpy w4*w5,B,[w8]+=2,w4,[w10]+=2,w5"); asm volatile ("mov #_c,w9"); // W9 = &c asm volatile ("mov #_z1,w11"); // W11 = &z1 asm volatile ("mov #_z2,w12"); // W12 = &z2 asm volatile ("movsac A,[w9]+=2,w6,[w11],w7"); // W6 = c[i], W7 = sw[i] asm volatile ("do #2,.myendfor"); //MAX_SLOTS asm volatile ("clr A"); asm volatile ("ADD A"); asm volatile ("mac w6*w7,A,[w9]+=2,w6"); asm volatile ("sac A,#-5,w0"); // W0=z0 asm volatile ("sub w0,[w12],w0"); asm volatile ("mov [w11],[w12++]"); //z2[i] = z1[i]; asm volatile ("mov w0,[w11++]"); //z1[i] = z0; asm volatile ("mov [w11],w7"); asm volatile (".myendfor: nop"); asm volatile ("mov #_opBuffWindow+160*2,w0"); asm volatile ("cp w8,w0"); asm volatile ("BRA NZ, skip1"); asm volatile ("mov #_opBuffWindow,w8"); // W8 = p asm volatile ("movsac A,[w8]+=2,w4"); // W4 = *p asm volatile ("skip1:"); asm volatile (".myenddo: nop"); asm volatile ("mov #_readPointer,w1"); asm volatile ("mov [w1],w0"); asm volatile ("add #32*2,w0"); //SUB_BLOCK_LENGTH asm volatile ("mov #_opBuffWindow+160*2,w2"); asm volatile ("cp w0,w2"); // CP literal accetta max 32... asm volatile ("BRA NZ, skip2"); asm volatile ("mov #_opBuffWindow,w0"); asm volatile ("mov w0,[w1]"); asm volatile ("BRA skip2_"); asm volatile ("skip2:"); asm volatile ("mov w0,[w1]"); asm volatile ("skip2_:"); asm volatile ("mov #_cw,w8"); // W8 = &cw asm volatile ("mov #_sw,w9"); // W9 = &sw asm volatile ("mov #_z1,w10"); // W10 = &z1 asm volatile ("mov #_z2,w11"); // W11 = &z2 asm volatile ("mov #_I,w12"); // W12 = &I asm volatile ("mov #_Q,w13"); // W13 = &Q asm volatile ("mov #_fftData,w7"); // W7 = &fftData asm volatile ("movsac A,[w8]+=2,w4,[w10]+=2,w6"); // W4 = cw[i],W6 = z1[i] asm volatile ("mov [w9++],w5"); // W5 = sw[i] asm volatile ("do #2,.myendfor2"); //MAX_SLOTS asm volatile ("mpy w4*w6,A,[w8]+=2,w4"); asm volatile ("lac [w11++],#5,B"); // z2[i] asm volatile ("SUB A"); // forse usare MSC e LAC.. no, può solo sottrarre da acc... asm volatile ("sac A,#-5,w0"); // W12=I[i] asm volatile ("mpy w5*w6,B,[w9]+=2,w5,[w10]+=2,w6"); asm volatile ("sac B,#-5,w1"); // W13=Q[i] // CORCONbits.US=0b01; // unsigned multiply, SERVE QUA?? e poi risistemare per sopra asm volatile ("exch w4,w0"); asm volatile ("exch w5,w1"); asm volatile ("mpy w4*w4,A"); asm volatile ("mac w5*w5,A"); asm volatile ("sac A,#1,[w7++]"); // W1=fftData[i] asm volatile ("exch w4,w0"); asm volatile ("exch w5,w1"); asm volatile ("mov w0,[w12++]"); asm volatile ("mov w1,[w13++]"); asm volatile (".myendfor2: nop"); /* if(SRbits.OAB) { // Accumulators A or B overflowed at some time SRbits.OAB=0; } if(SRbits.SAB) { // Accumulators A or B are saturated or have been saturated at some time SRbits.SAB=0; }*/
//Interrupt code *writePointer++ = (ADC1BUF0); *writePointer++ = (ADC1BUF1); *writePointer++ = (ADC1BUF2); *writePointer++ = (ADC1BUF3); *writePointer++ = (ADC1BUF4); *writePointer++ = (ADC1BUF5); *writePointer++ = (ADC1BUF6); *writePointer++ = (ADC1BUF7); *writePointer++ = (ADC1BUF8); *writePointer++ = (ADC1BUF9); *writePointer++ = (ADC1BUFA); *writePointer++ = (ADC1BUFB); *writePointer++ = (ADC1BUFC); *writePointer++ = (ADC1BUFD); *writePointer++ = (ADC1BUFE); *writePointer++ = (ADC1BUFF); counter+=16; if(counter>=SUB_BLOCK_LENGTH) { counter=0; ADCdone=1; // if(writePointer >= opBuffWindow+FFT_BLOCK_LENGTH+SUB_BLOCK_LENGTH) writePointer=opBuffWindow; }
|
|
|
Post by DarioG on Aug 6, 2021 18:38:43 GMT
some improvement: among the rest, it turns out that Q6.10 format may not be enough (yet) so I tried Q7.9 and it seems to work better with higher-level input signals. Hence
asm volatile ("mov #_readPointer,w0" : : : "w0","w1","w2","w4","w5","w6","w7","w8","w9","w10","w11","w12","w13"); // per clobbering registers https://www.eevblog.com/forum/microcontrollers/inserting-some-assembly-inside-c-code-for-pic24fj-series/ asm volatile ("mov [w0],w8"); // W8 = p asm volatile ("mov #_win,w10"); // W10 = pw asm volatile ("movsac A,[w8]+=2,w4,[w10]+=2,w5"); // W4 = *p, W5 = *pw asm volatile ("mov #_opBuffWindow+%0*2,w1" :: "i"(FFT_BLOCK_LENGTH+SUB_BLOCK_LENGTH)); // precalcolo fine loop asm volatile ("do #%0,.myenddo" :: "i"(FFT_BLOCK_LENGTH-1)); asm volatile ("mpy w4*w5,B,[w8]+=2,w4,[w10]+=2,w5"); asm volatile ("mov #_c,w9"); // W9 = &c // USARE cw e shiftare al volo?? asm volatile ("mov #_z1,w11"); // W11 = &z1 asm volatile ("mov #_z2,w12"); // W12 = &z2 asm volatile ("movsac A,[w9]+=2,w6,[w11],w7"); // W6 = c[i], W7 = sw[i] // non va! ah perché w11 è accettato solo in ymemory e lo prende così... asm volatile ("do #%0,.myendfor" :: "i"(MAX_SLOTS-1)); // asm volatile ("clr A"); asm volatile ("ADD A"); asm volatile ("mac w6*w7,A,[w9]+=2,w6"); asm volatile ("sac A,#%0,w0" :: "i"(-(15-Q1_FACTOR))); // W0=z0 asm volatile ("sub w0,[w12],w0"); asm volatile ("mov [w11],[w12++]"); //z2[i] = z1[i]; asm volatile ("mov w0,[w11++]"); //z1[i] = z0; asm volatile ("mov [w11],w7"); asm volatile (".myendfor: nop"); asm volatile ("cp w8,w1"); asm volatile ("BRA NZ, skip1"); asm volatile ("mov #_opBuffWindow,w8"); // W8 = p asm volatile ("movsac A,[w8]+=2,w4"); // W4 = *p asm volatile ("skip1:"); asm volatile (".myenddo: nop"); asm volatile ("mov #_readPointer,w2"); asm volatile ("mov [w2],w0"); asm volatile ("add #%0*2,w0" :: "i"(SUB_BLOCK_LENGTH)); //SUB_BLOCK_LENGTH asm volatile ("cp w0,w1"); // CP literal accetta max 32... asm volatile ("BRA NZ, skip2"); asm volatile ("mov #_opBuffWindow,w0"); asm volatile ("skip2:"); asm volatile ("mov w0,[w2]"); asm volatile ("mov #_cw,w8"); // W8 = &cw asm volatile ("mov #_sw,w9"); // W9 = &sw asm volatile ("mov #_z1,w10"); // W10 = &z1 asm volatile ("mov #_z2,w11"); // W11 = &z2 asm volatile ("mov #_I,w12"); // W12 = &I asm volatile ("mov #_Q,w13"); // W13 = &Q asm volatile ("mov #_fftData,w7"); // W7 = &fftData asm volatile ("movsac A,[w8]+=2,w4,[w10]+=2,w6"); // W4 = cw[i],W6 = z1[i] asm volatile ("mov [w9++],w5"); // W5 = sw[i] asm volatile ("do #%0,.myendfor2" :: "i"(MAX_SLOTS-1)); // asm volatile ("mpy w4*w6,A,[w8]+=2,w4"); asm volatile ("lac [w11++],#%0,B" :: "i"((15-Q1_FACTOR))); // z2[i] asm volatile ("SUB A"); // forse usare MSC e LAC.. no, può solo sottrarre da acc... asm volatile ("sac A,#%0,w0" :: "i"(-(15-Q1_FACTOR))); // W12=I[i] asm volatile ("mpy w5*w6,B,[w9]+=2,w5,[w10]+=2,w6"); asm volatile ("sac B,#%0,w1" :: "i"(-(15-Q1_FACTOR))); // W13=Q[i] // CORCONbits.US=0b01; // unsigned multiply, SERVE QUA?? e poi risistemare per sopra // DEVO mettere w12 e 13 in 4 e 5 per la mpy... però così poi devo ricaricarli e perdo il postinc di sopra asm volatile ("exch w4,w0"); asm volatile ("exch w5,w1"); asm volatile ("mpy w4*w4,A"); asm volatile ("mac w5*w5,A"); asm volatile ("sac A,#%0,[w7++]" :: "i"((Q1_FACTOR-9))); // W1=fftData[i] asm volatile ("exch w4,w0"); asm volatile ("exch w5,w1"); asm volatile ("mov w0,[w12++]"); asm volatile ("mov w1,[w13++]"); asm volatile (".myendfor2: nop");
and IRQ code
*writePointer++ = ((int)ADC1BUF0 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF1 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF2 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF3 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF4 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF5 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF6 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF7 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF8 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUF9 >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUFA >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUFB >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUFC >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUFD >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUFE >> (10-Q1_FACTOR)); *writePointer++ = ((int)ADC1BUFF >> (10-Q1_FACTOR));
|
|