00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q7.c 00009 * 00010 * Description: Convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00065 void arm_conv_q7( 00066 q7_t * pSrcA, 00067 uint32_t srcALen, 00068 q7_t * pSrcB, 00069 uint32_t srcBLen, 00070 q7_t * pDst) 00071 { 00072 00073 00074 #ifndef ARM_MATH_CM0 00075 00076 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00077 00078 q7_t *pIn1; /* inputA pointer */ 00079 q7_t *pIn2; /* inputB pointer */ 00080 q7_t *pOut = pDst; /* output pointer */ 00081 q7_t *px; /* Intermediate inputA pointer */ 00082 q7_t *py; /* Intermediate inputB pointer */ 00083 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00084 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ 00085 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00086 q31_t input1, input2; /* Temporary input variables */ 00087 q15_t in1, in2; /* Temporary input variables */ 00088 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00089 00090 00091 /* The algorithm implementation is based on the lengths of the inputs. */ 00092 /* srcB is always made to slide across srcA. */ 00093 /* So srcBLen is always considered as shorter or equal to srcALen */ 00094 if(srcALen >= srcBLen) 00095 { 00096 /* Initialization of inputA pointer */ 00097 pIn1 = pSrcA; 00098 00099 /* Initialization of inputB pointer */ 00100 pIn2 = pSrcB; 00101 } 00102 else 00103 { 00104 /* Initialization of inputA pointer */ 00105 pIn1 = pSrcB; 00106 00107 /* Initialization of inputB pointer */ 00108 pIn2 = pSrcA; 00109 00110 /* srcBLen is always considered as shorter or equal to srcALen */ 00111 j = srcBLen; 00112 srcBLen = srcALen; 00113 srcALen = j; 00114 } 00115 00116 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00117 /* The function is internally 00118 * divided into three stages according to the number of multiplications that has to be 00119 * taken place between inputA samples and inputB samples. In the first stage of the 00120 * algorithm, the multiplications increase by one for every iteration. 00121 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00122 * In the third stage of the algorithm, the multiplications decrease by one 00123 * for every iteration. */ 00124 00125 /* The algorithm is implemented in three stages. 00126 The loop counters of each stage is initiated here. */ 00127 blockSize1 = srcBLen - 1u; 00128 blockSize2 = (srcALen - srcBLen) + 1u; 00129 blockSize3 = blockSize1; 00130 00131 /* -------------------------- 00132 * Initializations of stage1 00133 * -------------------------*/ 00134 00135 /* sum = x[0] * y[0] 00136 * sum = x[0] * y[1] + x[1] * y[0] 00137 * .... 00138 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00139 */ 00140 00141 /* In this stage the MAC operations are increased by 1 for every iteration. 00142 The count variable holds the number of MAC operations performed */ 00143 count = 1u; 00144 00145 /* Working pointer of inputA */ 00146 px = pIn1; 00147 00148 /* Working pointer of inputB */ 00149 py = pIn2; 00150 00151 00152 /* ------------------------ 00153 * Stage1 process 00154 * ----------------------*/ 00155 00156 /* The first stage starts here */ 00157 while(blockSize1 > 0u) 00158 { 00159 /* Accumulator is made zero for every iteration */ 00160 sum = 0; 00161 00162 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00163 k = count >> 2u; 00164 00165 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00166 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00167 while(k > 0u) 00168 { 00169 /* x[0] , x[1] */ 00170 in1 = (q15_t) * px++; 00171 in2 = (q15_t) * px++; 00172 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00173 00174 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00175 in1 = (q15_t) * py--; 00176 in2 = (q15_t) * py--; 00177 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00178 00179 /* x[0] * y[srcBLen - 1] */ 00180 /* x[1] * y[srcBLen - 2] */ 00181 sum = __SMLAD(input1, input2, sum); 00182 00183 /* x[2] , x[3] */ 00184 in1 = (q15_t) * px++; 00185 in2 = (q15_t) * px++; 00186 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00187 00188 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00189 in1 = (q15_t) * py--; 00190 in2 = (q15_t) * py--; 00191 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00192 00193 /* x[2] * y[srcBLen - 3] */ 00194 /* x[3] * y[srcBLen - 4] */ 00195 sum = __SMLAD(input1, input2, sum); 00196 00197 /* Decrement the loop counter */ 00198 k--; 00199 } 00200 00201 /* If the count is not a multiple of 4, compute any remaining MACs here. 00202 ** No loop unrolling is used. */ 00203 k = count % 0x4u; 00204 00205 while(k > 0u) 00206 { 00207 /* Perform the multiply-accumulates */ 00208 sum += ((q15_t) * px++ * *py--); 00209 00210 /* Decrement the loop counter */ 00211 k--; 00212 } 00213 00214 /* Store the result in the accumulator in the destination buffer. */ 00215 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00216 00217 /* Update the inputA and inputB pointers for next MAC calculation */ 00218 py = pIn2 + count; 00219 px = pIn1; 00220 00221 /* Increment the MAC count */ 00222 count++; 00223 00224 /* Decrement the loop counter */ 00225 blockSize1--; 00226 } 00227 00228 /* -------------------------- 00229 * Initializations of stage2 00230 * ------------------------*/ 00231 00232 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00233 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00234 * .... 00235 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00236 */ 00237 00238 /* Working pointer of inputA */ 00239 px = pIn1; 00240 00241 /* Working pointer of inputB */ 00242 pSrc2 = pIn2 + (srcBLen - 1u); 00243 py = pSrc2; 00244 00245 /* count is index by which the pointer pIn1 to be incremented */ 00246 count = 1u; 00247 00248 /* ------------------- 00249 * Stage2 process 00250 * ------------------*/ 00251 00252 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00253 * So, to loop unroll over blockSize2, 00254 * srcBLen should be greater than or equal to 4 */ 00255 if(srcBLen >= 4u) 00256 { 00257 /* Loop unroll over blockSize2, by 4 */ 00258 blkCnt = blockSize2 >> 2u; 00259 00260 while(blkCnt > 0u) 00261 { 00262 /* Set all accumulators to zero */ 00263 acc0 = 0; 00264 acc1 = 0; 00265 acc2 = 0; 00266 acc3 = 0; 00267 00268 /* read x[0], x[1], x[2] samples */ 00269 x0 = *(px++); 00270 x1 = *(px++); 00271 x2 = *(px++); 00272 00273 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00274 k = srcBLen >> 2u; 00275 00276 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00277 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00278 do 00279 { 00280 /* Read y[srcBLen - 1] sample */ 00281 c0 = *(py--); 00282 /* Read y[srcBLen - 2] sample */ 00283 c1 = *(py--); 00284 00285 /* Read x[3] sample */ 00286 x3 = *(px++); 00287 00288 /* x[0] and x[1] are packed */ 00289 in1 = (q15_t) x0; 00290 in2 = (q15_t) x1; 00291 00292 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00293 00294 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00295 in1 = (q15_t) c0; 00296 in2 = (q15_t) c1; 00297 00298 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00299 00300 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00301 acc0 = __SMLAD(input1, input2, acc0); 00302 00303 /* x[1] and x[2] are packed */ 00304 in1 = (q15_t) x1; 00305 in2 = (q15_t) x2; 00306 00307 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00308 00309 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00310 acc1 = __SMLAD(input1, input2, acc1); 00311 00312 /* x[2] and x[3] are packed */ 00313 in1 = (q15_t) x2; 00314 in2 = (q15_t) x3; 00315 00316 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00317 00318 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00319 acc2 = __SMLAD(input1, input2, acc2); 00320 00321 /* Read x[4] sample */ 00322 x0 = *(px++); 00323 00324 /* x[3] and x[4] are packed */ 00325 in1 = (q15_t) x3; 00326 in2 = (q15_t) x0; 00327 00328 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00329 00330 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00331 acc3 = __SMLAD(input1, input2, acc3); 00332 00333 /* Read y[srcBLen - 3] sample */ 00334 c0 = *(py--); 00335 /* Read y[srcBLen - 4] sample */ 00336 c1 = *(py--); 00337 00338 /* Read x[5] sample */ 00339 x1 = *(px++); 00340 00341 /* x[2] and x[3] are packed */ 00342 in1 = (q15_t) x2; 00343 in2 = (q15_t) x3; 00344 00345 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00346 00347 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00348 in1 = (q15_t) c0; 00349 in2 = (q15_t) c1; 00350 00351 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00352 00353 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00354 acc0 = __SMLAD(input1, input2, acc0); 00355 00356 /* x[3] and x[4] are packed */ 00357 in1 = (q15_t) x3; 00358 in2 = (q15_t) x0; 00359 00360 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00361 00362 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00363 acc1 = __SMLAD(input1, input2, acc1); 00364 00365 /* x[4] and x[5] are packed */ 00366 in1 = (q15_t) x0; 00367 in2 = (q15_t) x1; 00368 00369 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00370 00371 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00372 acc2 = __SMLAD(input1, input2, acc2); 00373 00374 /* Read x[6] sample */ 00375 x2 = *(px++); 00376 00377 /* x[5] and x[6] are packed */ 00378 in1 = (q15_t) x1; 00379 in2 = (q15_t) x2; 00380 00381 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00382 00383 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00384 acc3 = __SMLAD(input1, input2, acc3); 00385 00386 } while(--k); 00387 00388 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00389 ** No loop unrolling is used. */ 00390 k = srcBLen % 0x4u; 00391 00392 while(k > 0u) 00393 { 00394 /* Read y[srcBLen - 5] sample */ 00395 c0 = *(py--); 00396 00397 /* Read x[7] sample */ 00398 x3 = *(px++); 00399 00400 /* Perform the multiply-accumulates */ 00401 /* acc0 += x[4] * y[srcBLen - 5] */ 00402 acc0 += ((q15_t) x0 * c0); 00403 /* acc1 += x[5] * y[srcBLen - 5] */ 00404 acc1 += ((q15_t) x1 * c0); 00405 /* acc2 += x[6] * y[srcBLen - 5] */ 00406 acc2 += ((q15_t) x2 * c0); 00407 /* acc3 += x[7] * y[srcBLen - 5] */ 00408 acc3 += ((q15_t) x3 * c0); 00409 00410 /* Reuse the present samples for the next MAC */ 00411 x0 = x1; 00412 x1 = x2; 00413 x2 = x3; 00414 00415 /* Decrement the loop counter */ 00416 k--; 00417 } 00418 00419 00420 /* Store the result in the accumulator in the destination buffer. */ 00421 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00422 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00423 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00424 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00425 00426 /* Update the inputA and inputB pointers for next MAC calculation */ 00427 px = pIn1 + (count * 4u); 00428 py = pSrc2; 00429 00430 /* Increment the pointer pIn1 index, count by 1 */ 00431 count++; 00432 00433 /* Decrement the loop counter */ 00434 blkCnt--; 00435 } 00436 00437 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00438 ** No loop unrolling is used. */ 00439 blkCnt = blockSize2 % 0x4u; 00440 00441 while(blkCnt > 0u) 00442 { 00443 /* Accumulator is made zero for every iteration */ 00444 sum = 0; 00445 00446 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00447 k = srcBLen >> 2u; 00448 00449 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00450 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00451 while(k > 0u) 00452 { 00453 00454 /* Reading two inputs of SrcA buffer and packing */ 00455 in1 = (q15_t) * px++; 00456 in2 = (q15_t) * px++; 00457 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00458 00459 /* Reading two inputs of SrcB buffer and packing */ 00460 in1 = (q15_t) * py--; 00461 in2 = (q15_t) * py--; 00462 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00463 00464 /* Perform the multiply-accumulates */ 00465 sum = __SMLAD(input1, input2, sum); 00466 00467 /* Reading two inputs of SrcA buffer and packing */ 00468 in1 = (q15_t) * px++; 00469 in2 = (q15_t) * px++; 00470 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00471 00472 /* Reading two inputs of SrcB buffer and packing */ 00473 in1 = (q15_t) * py--; 00474 in2 = (q15_t) * py--; 00475 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00476 00477 /* Perform the multiply-accumulates */ 00478 sum = __SMLAD(input1, input2, sum); 00479 00480 /* Decrement the loop counter */ 00481 k--; 00482 } 00483 00484 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00485 ** No loop unrolling is used. */ 00486 k = srcBLen % 0x4u; 00487 00488 while(k > 0u) 00489 { 00490 /* Perform the multiply-accumulates */ 00491 sum += ((q15_t) * px++ * *py--); 00492 00493 /* Decrement the loop counter */ 00494 k--; 00495 } 00496 00497 /* Store the result in the accumulator in the destination buffer. */ 00498 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00499 00500 /* Update the inputA and inputB pointers for next MAC calculation */ 00501 px = pIn1 + count; 00502 py = pSrc2; 00503 00504 /* Increment the pointer pIn1 index, count by 1 */ 00505 count++; 00506 00507 /* Decrement the loop counter */ 00508 blkCnt--; 00509 } 00510 } 00511 else 00512 { 00513 /* If the srcBLen is not a multiple of 4, 00514 * the blockSize2 loop cannot be unrolled by 4 */ 00515 blkCnt = blockSize2; 00516 00517 while(blkCnt > 0u) 00518 { 00519 /* Accumulator is made zero for every iteration */ 00520 sum = 0; 00521 00522 /* srcBLen number of MACS should be performed */ 00523 k = srcBLen; 00524 00525 while(k > 0u) 00526 { 00527 /* Perform the multiply-accumulate */ 00528 sum += ((q15_t) * px++ * *py--); 00529 00530 /* Decrement the loop counter */ 00531 k--; 00532 } 00533 00534 /* Store the result in the accumulator in the destination buffer. */ 00535 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00536 00537 /* Update the inputA and inputB pointers for next MAC calculation */ 00538 px = pIn1 + count; 00539 py = pSrc2; 00540 00541 /* Increment the MAC count */ 00542 count++; 00543 00544 /* Decrement the loop counter */ 00545 blkCnt--; 00546 } 00547 } 00548 00549 00550 /* -------------------------- 00551 * Initializations of stage3 00552 * -------------------------*/ 00553 00554 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00555 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00556 * .... 00557 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00558 * sum += x[srcALen-1] * y[srcBLen-1] 00559 */ 00560 00561 /* In this stage the MAC operations are decreased by 1 for every iteration. 00562 The blockSize3 variable holds the number of MAC operations performed */ 00563 00564 /* Working pointer of inputA */ 00565 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00566 px = pSrc1; 00567 00568 /* Working pointer of inputB */ 00569 pSrc2 = pIn2 + (srcBLen - 1u); 00570 py = pSrc2; 00571 00572 /* ------------------- 00573 * Stage3 process 00574 * ------------------*/ 00575 00576 while(blockSize3 > 0u) 00577 { 00578 /* Accumulator is made zero for every iteration */ 00579 sum = 0; 00580 00581 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00582 k = blockSize3 >> 2u; 00583 00584 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00585 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00586 while(k > 0u) 00587 { 00588 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00589 in1 = (q15_t) * px++; 00590 in2 = (q15_t) * px++; 00591 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00592 00593 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00594 in1 = (q15_t) * py--; 00595 in2 = (q15_t) * py--; 00596 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00597 00598 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00599 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00600 sum = __SMLAD(input1, input2, sum); 00601 00602 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00603 in1 = (q15_t) * px++; 00604 in2 = (q15_t) * px++; 00605 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00606 00607 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00608 in1 = (q15_t) * py--; 00609 in2 = (q15_t) * py--; 00610 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00611 00612 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00613 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00614 sum = __SMLAD(input1, input2, sum); 00615 00616 /* Decrement the loop counter */ 00617 k--; 00618 } 00619 00620 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00621 ** No loop unrolling is used. */ 00622 k = blockSize3 % 0x4u; 00623 00624 while(k > 0u) 00625 { 00626 /* Perform the multiply-accumulates */ 00627 sum += ((q15_t) * px++ * *py--); 00628 00629 /* Decrement the loop counter */ 00630 k--; 00631 } 00632 00633 /* Store the result in the accumulator in the destination buffer. */ 00634 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00635 00636 /* Update the inputA and inputB pointers for next MAC calculation */ 00637 px = ++pSrc1; 00638 py = pSrc2; 00639 00640 /* Decrement the loop counter */ 00641 blockSize3--; 00642 } 00643 00644 #else 00645 00646 /* Run the below code for Cortex-M0 */ 00647 00648 q7_t *pIn1 = pSrcA; /* input pointer */ 00649 q7_t *pIn2 = pSrcB; /* coefficient pointer */ 00650 q31_t sum; /* Accumulator */ 00651 uint32_t i, j; /* loop counter */ 00652 00653 /* Loop to calculate output of convolution for output length number of times */ 00654 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00655 { 00656 /* Initialize sum with zero to carry on MAC operations */ 00657 sum = 0; 00658 00659 /* Loop to perform MAC operations according to convolution equation */ 00660 for (j = 0; j <= i; j++) 00661 { 00662 /* Check the array limitations */ 00663 if(((i - j) < srcBLen) && (j < srcALen)) 00664 { 00665 /* z[i] += x[i-j] * y[j] */ 00666 sum += (q15_t) pIn1[j] * (pIn2[i - j]); 00667 } 00668 } 00669 00670 /* Store the output in the destination buffer */ 00671 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); 00672 } 00673 00674 #endif /* #ifndef ARM_MATH_CM0 */ 00675 00676 } 00677