1 /* k6opt.s vector functions optimized for MMX extensions to x86
3 * Copyright (C) 1999 by Stanley J. Brooks <stabro@megsinet.net>
5 * Any use of this software is permitted provided that this notice is not
6 * removed and that neither the authors nor the Technische Universitaet Berlin
7 * are deemed to have made any representations as to the suitability of this
8 * software for any purpose nor are held responsible for any defects of
9 * this software. THERE IS ABSOLUTELY NO WARRANTY FOR THIS SOFTWARE;
10 * not even the implied warranty of MERCHANTABILITY or FITNESS FOR
11 * A PARTICULAR PURPOSE.
39 /* void Weighting_filter (const short *e, short *x) */
/* NOTE(review): only part of this routine is visible in this excerpt; the
 * entry label, prologue, coefficient loads, loop control and epilogue are
 * not shown.  The notes below describe just the visible instructions. */
40 .globl Weighting_filter
41 .type Weighting_filter,@function
/* mm5 receives 0x1000 in its low dword; it is the term added for rounding below */
52 movl $0x1000,%eax; movd %eax,%mm5 /* for rounding */
/* Three 8-byte loads (4 words each) at byte offsets 0, 8 and 16 from
 * ebx + 2*esi.  mm4 is loaded twice; the instructions that consume each
 * loaded value are not visible in this excerpt. */
59 movq (%ebx,%esi,2),%mm0
62 movq 8(%ebx,%esi,2),%mm4
66 movq 16(%ebx,%esi,2),%mm4
71 punpckhdq %mm0,%mm4 /* mm4 has high int32 of mm0 dup'd */
74 paddd %mm5,%mm0 /* add the rounding term held in mm5 */
77 movd %mm0,%eax /* ax has result */
78 movw %ax,(%edi,%esi,2) /* store the 16-bit result at word index esi from base edi */
89 .size Weighting_filter,.Lfe1-Weighting_filter
104 /* long k6maxcc(const short *wt, const short *dp, short *Nc_out) */
/* NOTE(review): this excerpt shows only a few instructions of k6maxcc; the
 * entry label, argument loads, search loop and return path are not visible.
 * Per the existing comments, edx tracks the largest inner product seen and
 * ecx the index at which it occurred. */
106 .type k6maxcc,@function
116 movl $0,%edx /* will be maximum inner-product; starts at 0 */
118 movl %ebx,%ecx /* will be index of max inner-product; initialised from ebx */
136 punpckhdq %mm0,%mm1 /* mm1 has high int32 of mm0 dup'd */
138 movd %mm0,%eax /* eax has result (low 32 bits of mm0) */
160 .size k6maxcc,.Lfe2-k6maxcc
164 /* long k6iprod (const short *p, const short *q, int n) */
/* NOTE(review): partial view; the entry label, argument loads, the
 * multiply-accumulate instructions and the epilogue are not shown.
 *
 * The visible lines show a staged structure.  edx is a limit pointer that
 * esi is compared against (unsigned); the limit is relaxed in steps
 * (top-32, top-8, top-4, top-2).  "top" is esi + 2*eax as computed by the
 * leal below; presumably esi holds p and eax holds n -- confirm against the
 * argument loads. */
166 .type k6iprod,@function
177 leal -32(%esi,%eax,2),%edx /* edx = top - 32 */
179 cmpl %edx,%esi; ja .L202 /* esi above the limit: branch past this stage */
190 cmpl %edx,%esi; jbe .L201 /* esi at or below the limit: branch to .L201 */
194 addl $24,%edx /* now edx = top-8 */
195 cmpl %edx,%esi; ja .L205 /* esi above the limit: branch past this stage */
203 cmpl %edx,%esi; jbe .L203 /* esi at or below the limit: branch to .L203 */
207 addl $4,%edx /* now edx = top-4 */
208 cmpl %edx,%esi; ja .L207 /* esi above the limit: branch past this stage */
220 addl $2,%edx /* now edx = top-2 */
221 cmpl %edx,%esi; ja .L209 /* esi above the limit: branch past this stage */
233 punpckhdq %mm0,%mm1 /* mm1 has high int32 of mm0 dup'd */
235 movd %mm0,%eax /* eax has result (low 32 bits of mm0) */
243 .size k6iprod,.Lfe3-k6iprod
247 /* void k6vsraw (short *p, int n, int bits) */
/* NOTE(review): partial view; the entry label, argument loads, the shift
 * instructions applied to the data and the epilogue are not shown.  The
 * register-to-argument mapping (which of p, n, bits is in esi, eax, ecx)
 * cannot be confirmed from the visible lines.
 *
 * edx is a limit pointer derived from esi + 2*eax ("top"); it is relaxed in
 * steps (top-16, top-4, top-2) between stages of decreasing chunk size. */
249 .type k6vsraw,@function
256 andl %ecx,%ecx; jle .L399 /* ecx <= 0 (signed): branch to .L399 */
258 leal -16(%esi,%eax,2),%edx /* edx = top - 16 */
262 psllw %mm3,%mm2; psrlw $1,%mm2 /* each word of mm2: shift left by the count in mm3, then logical right by 1 */
263 cmpl %edx,%esi; ja .L306 /* esi above the limit (unsigned): branch to .L306 */
266 .L302: /* 8 words per iteration */
281 addl $12,%edx /* now edx = top-4 */
282 cmpl %edx,%esi; ja .L310 /* esi above the limit (unsigned): branch to .L310 */
285 .L308: /* do up to 6 words, two at a time */
296 addl $2,%edx /* now edx = top-2 */
297 cmpl %edx,%esi; ja .L315 /* esi above the limit (unsigned): branch to .L315 */
314 .size k6vsraw,.Lfe4-k6vsraw
317 /* void k6vsllw (short *p, int n, int bits) */
/* NOTE(review): partial view; the entry label, argument loads, the shift
 * instructions applied to the data and the epilogue are not shown.  The
 * register-to-argument mapping (which of p, n, bits is in esi, eax, ecx)
 * cannot be confirmed from the visible lines.
 *
 * edx is a limit pointer derived from esi + 2*eax ("top"); it is relaxed in
 * steps (top-16, top-4, top-2) between stages of decreasing chunk size. */
319 .type k6vsllw,@function
326 andl %ecx,%ecx; jle .L499 /* ecx <= 0 (signed): branch to .L499 */
328 leal -16(%esi,%eax,2),%edx /* edx = top - 16 */
331 cmpl %edx,%esi; ja .L406 /* esi above the limit (unsigned): branch to .L406 */
334 .L402: /* 8 words per iteration */
347 addl $12,%edx /* now edx = top-4 */
348 cmpl %edx,%esi; ja .L410 /* esi above the limit (unsigned): branch to .L410 */
351 .L408: /* do up to 6 words, two at a time */
361 addl $2,%edx /* now edx = top-2 */
362 cmpl %edx,%esi; ja .L415 /* esi above the limit (unsigned): branch to .L415 */
378 .size k6vsllw,.Lfe5-k6vsllw
383 .type extremes,@object
396 /* long k6maxmin (const short *p, int n, short *out) */
/* NOTE(review): partial view; the entry label, prologue, loop branches, the
 * loads of each new group of words and the epilogue are not shown.
 *
 * Visible structure: mm0 accumulates per-word maxima and mm1 per-word minima,
 * both seeded from the first four words at (esi).  Each update uses pcmpgtw
 * to build a per-word mask, then pand/pandn to keep the winning words from
 * the candidate and from the current accumulator.  The instruction that
 * combines the two masked halves is on lines not shown (presumably a por --
 * confirm).  The four-word accumulators are then merged down to two words and
 * finally to one, leaving the max in cx and the min in ax.
 *
 * edx is a limit pointer, esi + 2*eax - 8 ("top - 8"), relaxed to top-4 and
 * top-2 for the shorter tail stages. */
398 .type k6maxmin,@function
406 leal -8(%esi,%eax,2),%edx /* edx = top - 8 */
416 movq (%esi),%mm0 /* mm0 will be max's */
417 movq %mm0,%mm1 /* mm1 will be min's */
427 pcmpgtw %mm0,%mm3 /* mm3 is bitmask for words where mm2 > mm0 */
429 pand %mm2,%mm3 /* mm3 is mm2 masked to new max's */
430 pandn %mm0,%mm4 /* mm4 is mm0 masked to its max's */
432 movq %mm4,%mm0 /* now mm0 is updated max's */
435 pcmpgtw %mm2,%mm3 /* mm3 is bitmask for words where mm2 < mm1 */
436 pand %mm3,%mm2 /* mm2 is mm2 masked to new min's */
437 pandn %mm1,%mm3 /* mm3 is mm1 masked to its min's */
439 movq %mm2,%mm1 /* now mm1 is updated min's */
446 .L56: /* merge down the 4-word max/mins to lower 2 words */
451 pcmpgtw %mm0,%mm3 /* mm3 is bitmask for words where mm2 > mm0 */
452 pand %mm3,%mm2 /* mm2 is mm2 masked to new max's */
453 pandn %mm0,%mm3 /* mm3 is mm0 masked to its max's */
455 movq %mm2,%mm0 /* now mm0 is updated max's */
460 pcmpgtw %mm2,%mm3 /* mm3 is bitmask for words where mm2 < mm1 */
461 pand %mm3,%mm2 /* mm2 is mm2 masked to new min's */
462 pandn %mm1,%mm3 /* mm3 is mm1 masked to its min's */
464 movq %mm2,%mm1 /* now mm1 is updated min's */
468 addl $4,%edx /* now edx = top-4 */
471 /* here, there are >= 2 words of input remaining */
475 pcmpgtw %mm0,%mm3 /* mm3 is bitmask for words where mm2 > mm0 */
477 pand %mm2,%mm3 /* mm3 is mm2 masked to new max's */
478 pandn %mm0,%mm4 /* mm4 is mm0 masked to its max's */
480 movq %mm4,%mm0 /* now mm0 is updated max's */
483 pcmpgtw %mm2,%mm3 /* mm3 is bitmask for words where mm2 < mm1 */
484 pand %mm3,%mm2 /* mm2 is mm2 masked to new min's */
485 pandn %mm1,%mm3 /* mm3 is mm1 masked to its min's */
487 movq %mm2,%mm1 /* now mm1 is updated min's */
493 /* merge down the 2-word max/mins to 1 word */
498 pcmpgtw %mm0,%mm3 /* mm3 is bitmask for words where mm2 > mm0 */
499 pand %mm3,%mm2 /* mm2 is mm2 masked to new max's */
500 pandn %mm0,%mm3 /* mm3 is mm0 masked to its max's */
502 movd %mm2,%ecx /* cx is max so far */
507 pcmpgtw %mm2,%mm3 /* mm3 is bitmask for words where mm2 < mm1 */
508 pand %mm3,%mm2 /* mm2 is mm2 masked to new min's */
509 pandn %mm1,%mm3 /* mm3 is mm1 masked to its min's */
511 movd %mm2,%eax /* ax is min so far */
513 addl $2,%edx /* now edx = top-2 */
517 /* here, there is one word of input left */
528 .L65: /* (finally!) cx is the max, ax the min */
532 movl 16(%ebp),%edx /* ptr to output max,min vals */
533 andl %edx,%edx; jz .L77 /* output pointer is zero: branch to .L77, skipping the stores below */
534 movw %cx,(%edx) /* max goes to the first output word */
535 movw %ax,2(%edx) /* min goes to the second output word */
538 /* now calculate max absolute val */
550 .size k6maxmin,.Lfe6-k6maxmin
552 /* void Short_term_analysis_filtering (short *u0, const short *rp0, int kn, short *s) */
/* NOTE(review): the symbol defined below is Short_term_analysis_filteringx
 * (trailing "x"), while the prototype comment above names it without the
 * "x" -- confirm which name callers use.
 *
 * NOTE(review): a second definition of this same symbol, reusing the .L185
 * and .Lfe7 labels, appears later in the file.  Presumably one of the two is
 * excluded by conditional assembly that is not visible in this excerpt --
 * confirm before assembling.
 *
 * Partial view: the prologue, the pm_* and lv_*_top offset definitions, the
 * loop that fills lv_rp, the multiply instructions, the loop branches and
 * the epilogue are not shown.
 *
 * Notation in the comments: RP is the current coefficient, U the current
 * u[] entry, DI the running sample value; a register written (A,B) holds A
 * in its upper word and B in its lower word. */
559 .equiv lv_rp,-40 /* local version of rp0 with each word twice */
561 .globl Short_term_analysis_filteringx
562 .type Short_term_analysis_filteringx,@function
563 Short_term_analysis_filteringx:
570 movl pm_rp0(%ebp),%esi; /* esi = rp0 argument (source) */
571 leal lv_rp(%ebp),%edi; /* edi = address of the local lv_rp area (destination) */
584 punpckldq %mm4,%mm4 /* (0x00004000,0x00004000) for rounding dword product pairs */
586 movl pm_u0(%ebp),%eax
588 movl %eax,lv_u_top(%ebp) /* UTOP (derived from u0; any adjustment is on a line not shown) */
589 movl pm_s(%ebp),%edx /* edx is local s ptr throughout below */
590 movl pm_kn(%ebp),%eax
591 leal (%edx,%eax,2),%eax /* eax = s + 2*kn bytes, i.e. the address of s[kn] */
592 movl %eax,lv_s_top(%ebp) /* saved as the end marker for the s loop */
597 leal lv_rp(%ebp),%esi /* RP */
598 movl pm_u0(%ebp),%edi /* U */
599 movw (%edx),%ax /* (0,DI) */
601 movw (%edx),%ax /* (DI,DI) */
603 .L185: /* RP is %esi */
605 movw (%edi),%ax /* (DI,U) */
606 movd (%esi),%mm3 /* mm3 is (0,0,RP,RP) */
609 movd %eax,%mm2 /* mm2 is (0,0,DI,U) */
611 movd %eax,%mm1 /* mm1 is (0,0,U,DI) */
616 punpcklwd %mm1,%mm0 /* mm0 is (RP*U,RP*DI) */
617 paddd %mm4,%mm0 /* mm4 is 0x00004000,0x00004000 */
618 psrad $15,%mm0 /* (RP*U,RP*DI) adjusted */
619 packssdw %mm0,%mm0 /* (*,*,RP*U,RP*DI) adjusted and saturated to word */
620 paddsw %mm2,%mm0 /* mm0 is (?,?, DI', U') */
621 movd %mm0,%eax /* (DI',U') */
625 cmpl lv_u_top(%ebp),%edi /* compare the U pointer against UTOP */
629 movw %ax,(%edx) /* last DI goes to *s */
630 addl $2,%edx /* next s */
631 cmpl lv_s_top(%ebp),%edx /* compare the s pointer against the saved s[kn] address */
641 .size Short_term_analysis_filteringx,.Lfe7-Short_term_analysis_filteringx
645 /* 'as' macros seem to be case-insensitive */
/* NOTE(review): the first instruction below uses the macro-argument
 * reference \n, so these lines appear to belong to one or more .macro bodies
 * whose .macro/.endm lines are not visible in this excerpt -- confirm the
 * boundaries before editing.
 *
 * Notation: RP is the current coefficient, U the current u value, DI the
 * running sample value; (A,B) means A in the upper word, B in the lower. */
648 movd \n(%esi),%mm3 /* mm3 is (0,0,RP,RP); RP read at byte offset \n from esi */
650 movd (%esi),%mm3 /* mm3 is (0,0,RP,RP) */
653 movd %mm4,%ecx; movw %cx,%ax /* (DI,U): ax takes the lowest word of mm4 */
/* mm4 = (mm4 >> 16) | (mm1 << 48): drop mm4's lowest word and insert mm1's lowest word as its highest */
654 psllq $48,%mm1; psrlq $16,%mm4; por %mm1,%mm4
/* mm5 = (mm5 >> 16) | (mm0 << 48): the same rotation, inserting mm0's lowest word */
655 psllq $48,%mm0; psrlq $16,%mm5; por %mm0,%mm5
657 movd %eax,%mm2 /* mm2 is (0,0,DI,U) */
659 movd %eax,%mm1 /* mm1 is (0,0,U,DI) */
664 punpcklwd %mm1,%mm0 /* mm0 is (RP*U,RP*DI) */
665 paddd %mm6,%mm0 /* mm6 is 0x00004000,0x00004000 */
666 psrad $15,%mm0 /* (RP*U,RP*DI) adjusted */
667 packssdw %mm0,%mm0 /* (*,*,RP*U,RP*DI) adjusted and saturated to word */
668 paddsw %mm2,%mm0 /* mm0 is (?,?, DI', U') */
669 movd %mm0,%eax /* (DI',U') */
672 /* void Short_term_analysis_filtering (short *u0, const short *rp0, int kn, short *s) */
/* NOTE(review): the symbol defined below is Short_term_analysis_filteringx
 * (trailing "x"), while the prototype comment above names it without the
 * "x" -- confirm which name callers use.
 *
 * NOTE(review): an earlier definition of this same symbol, using the same
 * .L185 and .Lfe7 labels, appears before this one in the file.  Presumably
 * one of the two is excluded by conditional assembly that is not visible in
 * this excerpt -- confirm before assembling.
 *
 * Partial view: the prologue, the pm_* and lv_*_top offset definitions, the
 * loop that fills lv_rp, the per-coefficient filter steps, the loop branches
 * and the epilogue are not shown.
 *
 * Visible structure: the eight u values are loaded into mm4/mm5 at the start
 * and written back to u0 at the end; edx walks s until it reaches the saved
 * address of s[kn]; esi walks the local coefficient copy lv_rp and is
 * compared against lv_rp_top. */
679 .equiv lv_rp,-40 /* local version of rp0 with each word twice */
681 .globl Short_term_analysis_filteringx
682 .type Short_term_analysis_filteringx,@function
683 Short_term_analysis_filteringx:
691 movl pm_rp0(%ebp),%esi; /* esi = rp0 argument (source) */
692 leal lv_rp(%ebp),%edi; /* edi = address of the local lv_rp area (destination) */
702 movl %edi,lv_rp_top(%ebp) /* remember the value of edi at this point as the end of lv_rp */
707 punpckldq %mm6,%mm6 /* (0x00004000,0x00004000) for rounding dword product pairs */
709 movl pm_u0(%ebp),%ebx /* ebx = u0 argument; used again for the write-back at the end */
710 movq (%ebx),%mm4; movq 8(%ebx),%mm5 /* the 8 u's */
711 movl pm_s(%ebp),%edx /* edx is local s ptr throughout below */
712 movl pm_kn(%ebp),%eax
713 leal (%edx,%eax,2),%eax /* eax = s + 2*kn bytes, i.e. the address of s[kn] */
714 movl %eax,lv_s_top(%ebp) /* saved as the end marker for the s loop */
719 leal lv_rp(%ebp),%esi /* RP */
720 movw (%edx),%ax /* (0,DI) */
722 movw (%edx),%ax /* (DI,DI) */
725 .L185: /* RP is %esi */
737 cmpl lv_rp_top(%ebp),%esi /* compare the RP pointer against the saved end of lv_rp */
741 movw %ax,(%edx) /* last DI goes to *s */
742 addl $2,%edx /* next s */
743 cmpl lv_s_top(%ebp),%edx /* compare the s pointer against the saved s[kn] address */
746 movq %mm4,(%ebx); movq %mm5,8(%ebx) /* the 8 u's, written back to u0 */
754 .size Short_term_analysis_filteringx,.Lfe7-Short_term_analysis_filteringx
755 .ident "GCC: (GNU) 2.95.2 19991109 (Debian GNU/Linux)"