mul_1.S source code [glibc/sysdeps/alpha/mul_1.S]

1	# Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
2	# the result in a second limb vector.
3
4	# Copyright (C) 1992-2022 Free Software Foundation, Inc.
5
6	# This file is part of the GNU MP Library.
7
8	# The GNU MP Library is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU Lesser General Public License as published by
10	# the Free Software Foundation; either version 2.1 of the License, or (at your
11	# option) any later version.
12
13	# The GNU MP Library is distributed in the hope that it will be useful, but
14	# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16	# License for more details.
17
18	# You should have received a copy of the GNU Lesser General Public License
19	# along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.
20
21
22	# INPUT PARAMETERS
23	# res_ptr r16
24	# s1_ptr r17
25	# size r18
26	# s2_limb r19
27
28	# This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
29
30	# To improve performance for long multiplications, we would use
31	# 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
32	# these instructions without slowing down the general code: 1. We can
33	# only have two prefetches in operation at any time in the Alpha
34	# architecture. 2. There will seldom be any special alignment
35	# between RES_PTR and S1_PTR. Maybe we can simply divide the current
36	# loop into an inner and outer loop, having the inner loop handle
37	# exactly one prefetch block?
38
39	.set noreorder
40	.set noat
41	.text
42	.align `3`
43	.globl __mpn_mul_1
44	.ent __mpn_mul_1 `2`
	# ------------------------------------------------------------------
	# __mpn_mul_1: multiply the size-limb vector at s1_ptr by the single
	# limb s2_limb and store the low size limbs of the product at res_ptr.
	#
	# In:    $16 = res_ptr, $17 = s1_ptr, $18 = size, $19 = s2_limb
	# Out:   $0  = limb left over above the last stored limb: the high
	#              half of the last product, plus the carry bit out of
	#              the last stored limb when size is greater than 1.
	#              NOTE(review): presumably the function return value
	#              under the Alpha calling convention - confirm.
	# Uses:  $2 = current s1 limb, $3 = low product half / limb to store,
	#        $4 = high product half. $16, $17 and $18 are modified.
	# Stack: not touched. Returns through the address in $26.
	#
	# size must be at least 1: the first limb is loaded and multiplied
	# before size is tested, and a size of 0 would wrap to a nonzero count.
	#
	# The work is software pipelined: each s1 limb is loaded one step
	# before it is multiplied, and the high half of a product is added
	# into the low half of the following product.
	# NOTE(review): the instruction order looks hand scheduled (see the
	# noreorder setting above) - do not reorder without re-measuring.
	# ------------------------------------------------------------------
45	__mpn_mul_1:
46	.frame $`30`,`0`,$`26`
47
	# Limb 0: its low half is stored unchanged because nothing carries in.
48	ldq $`2`,`0`($`17`) # $2 = s1_ptr[0]
49	subq $`18`,`1`,$`18` # size--
50	mulq $`2`,$`19`,$`3` # $3 = low half of s1_ptr[0] * s2_limb
51	bic $`31`,$`31`,$`4` # $4 = 0 (x AND NOT x), so the first addq into $0 adds nothing
52	umulh $`2`,$`19`,$`0` # $0 = high half of s1_ptr[0] * s2_limb
53	beq $`18`,Lend1 # size was 1: only limb 0 has to be stored
54	ldq $`2`,`8`($`17`) # $2 = s1_ptr[1], loaded ahead of its multiply
55	subq $`18`,`1`,$`18` # size--
56	stq $`3`,`0`($`16`) # res_ptr[0] = low half of limb 0
57	beq $`18`,Lend2 # size was 2: the limb in $2 is the last one
58
	# Main loop, one limb per pass. On entry:
	#   $2  = limb to multiply (the one at offset 8 from s1_ptr)
	#   $4  = high half of the previous product (0 on the first pass)
	#   $0  = carry bit from the previous pass (on the first pass it holds
	#         the high half of limb 0 instead, which is why $4 starts at 0)
	#   $18 = number of limbs that follow the one in $2
	# res_ptr points at the most recently stored limb, so stores use offset 8.
59	.align `3`
60	Loop: mulq $`2`,$`19`,$`3` # $3 = low half of the current product
61	addq $`4`,$`0`,$`0` # $0 = carry-in limb = previous high half + previous carry bit
62	subq $`18`,`1`,$`18` # size--
63	umulh $`2`,$`19`,$`4` # $4 = high half of the current product, used on the next pass
64	ldq $`2`,`16`($`17`) # $2 = following s1 limb, loaded one pass ahead
65	addq $`17`,`8`,$`17` # s1_ptr++
66	addq $`3`,$`0`,$`3` # $3 = low half + carry-in limb
67	stq $`3`,`8`($`16`) # store the result limb just above the previous one
68	cmpult $`3`,$`0`,$`0` # $0 = 1 if the sum wrapped (sum < carry-in), else 0
69	addq $`16`,`8`,$`16` # res_ptr++
70	bne $`18`,Loop # loop until only the limb already in $2 is left
71
	# Last limb (already in $2): same steps as a loop pass without the
	# look-ahead load and pointer updates, then fold the final carry bit
	# into the high half to form the limb left in $0.
72	Lend2: mulq $`2`,$`19`,$`3` # $3 = low half of the last product
73	addq $`4`,$`0`,$`0` # $0 = carry-in limb = previous high half + previous carry bit
74	umulh $`2`,$`19`,$`4` # $4 = high half of the last product
75	addq $`3`,$`0`,$`3` # $3 = low half + carry-in limb
76	cmpult $`3`,$`0`,$`0` # $0 = carry bit out of that addition
77	stq $`3`,`8`($`16`) # store the last result limb
78	addq $`4`,$`0`,$`0` # $0 = high half + carry bit
79	ret $`31`,($`26`),`1` # return to the address in $26
	# size == 1: store the low half of the only product. $0 already holds
	# its high half from the umulh in the prologue.
80	Lend1: stq $`3`,`0`($`16`) # res_ptr[0] = low half of limb 0
81	ret $`31`,($`26`),`1` # return to the address in $26
82
83	.end __mpn_mul_1
84

source code of glibc/sysdeps/alpha/mul_1.S