12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150 |
- // +build amd64,blsasm amd64,blsadx
- #include "textflag.h"
- // addition w/ modular reduction
- // a = (a + b) % p
- //
- // a and b are 384-bit field elements stored as six 64-bit limbs,
- // least-significant limb first. The six immediates below are the limbs
- // of the modulus p. NOTE(review): presumably the BLS12-381 base-field
- // prime (build tags mention blsasm/blsadx) — confirm against the
- // Go-side modulus declaration.
- TEXT ·addAssign(SB), NOSPLIT, $0-16
- // | load the two operand pointers
- MOVQ a+0(FP), DI
- MOVQ b+8(FP), SI
- // | load the six limbs of a into R8..R13
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- // | add b limb-by-limb, propagating the carry
- ADDQ (SI), R8
- ADCQ 8(SI), R9
- ADCQ 16(SI), R10
- ADCQ 24(SI), R11
- ADCQ 32(SI), R12
- ADCQ 40(SI), R13
- // | compute (sum - p) into R14,R15,CX,DX,SI,BX
- MOVQ R8, R14
- MOVQ R9, R15
- MOVQ R10, CX
- MOVQ R11, DX
- MOVQ R12, SI
- MOVQ R13, BX
- MOVQ $0xb9feffffffffaaab, AX
- SUBQ AX, R14
- MOVQ $0x1eabfffeb153ffff, AX
- SBBQ AX, R15
- MOVQ $0x6730d2a0f6b0f624, AX
- SBBQ AX, CX
- MOVQ $0x64774b84f38512bf, AX
- SBBQ AX, DX
- MOVQ $0x4b1ba7b6434bacd7, AX
- SBBQ AX, SI
- MOVQ $0x1a0111ea397fe69a, AX
- SBBQ AX, BX
- // | if the subtraction did not borrow (CF clear => sum >= p),
- // | keep the reduced value; otherwise keep the plain sum.
- // | (MOVQ does not touch flags, so CF survives the constant loads.)
- CMOVQCC R14, R8
- CMOVQCC R15, R9
- CMOVQCC CX, R10
- CMOVQCC DX, R11
- CMOVQCC SI, R12
- CMOVQCC BX, R13
- // | store the result back into a (DI still points at a)
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // addition w/ modular reduction
- // c = (a + b) % p
- //
- // Three-operand form of addAssign: reads a and b, writes the reduced
- // sum into c. All operands are six-limb little-endian field elements.
- TEXT ·add(SB), NOSPLIT, $0-24
- // | load operand pointers (c+0(FP) is fetched later, once DI is free)
- MOVQ a+8(FP), DI
- MOVQ b+16(FP), SI
- // | load the six limbs of a into R8..R13
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- // | add b limb-by-limb, propagating the carry
- ADDQ (SI), R8
- ADCQ 8(SI), R9
- ADCQ 16(SI), R10
- ADCQ 24(SI), R11
- ADCQ 32(SI), R12
- ADCQ 40(SI), R13
- // | compute (sum - p); DI is reused as scratch for the modulus limbs
- MOVQ R8, R14
- MOVQ R9, R15
- MOVQ R10, CX
- MOVQ R11, DX
- MOVQ R12, SI
- MOVQ R13, BX
- MOVQ $0xb9feffffffffaaab, DI
- SUBQ DI, R14
- MOVQ $0x1eabfffeb153ffff, DI
- SBBQ DI, R15
- MOVQ $0x6730d2a0f6b0f624, DI
- SBBQ DI, CX
- MOVQ $0x64774b84f38512bf, DI
- SBBQ DI, DX
- MOVQ $0x4b1ba7b6434bacd7, DI
- SBBQ DI, SI
- MOVQ $0x1a0111ea397fe69a, DI
- SBBQ DI, BX
- // | CF clear => sum >= p: select the reduced value
- CMOVQCC R14, R8
- CMOVQCC R15, R9
- CMOVQCC CX, R10
- CMOVQCC DX, R11
- CMOVQCC SI, R12
- CMOVQCC BX, R13
- // | store the result into c
- MOVQ c+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // addition w/o reduction check
- // c = (a + b)
- //
- // Plain six-limb addition; any final carry out of the top limb is
- // dropped. Callers are expected to know the sum fits in 384 bits.
- TEXT ·ladd(SB), NOSPLIT, $0-24
- // | load operand pointers
- MOVQ a+8(FP), DI
- MOVQ b+16(FP), SI
- // | load the six limbs of a
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- // | add b limb-by-limb, propagating the carry
- ADDQ (SI), R8
- ADCQ 8(SI), R9
- ADCQ 16(SI), R10
- ADCQ 24(SI), R11
- ADCQ 32(SI), R12
- ADCQ 40(SI), R13
- // | store the (unreduced) sum into c
- MOVQ c+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // addition w/o reduction check
- // a = a + b
- //
- // In-place variant of ladd: the unreduced six-limb sum overwrites a.
- // A carry out of the top limb is dropped.
- TEXT ·laddAssign(SB), NOSPLIT, $0-16
- // | load operand pointers
- MOVQ a+0(FP), DI
- MOVQ b+8(FP), SI
- // | load the six limbs of a
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- // | add b limb-by-limb, propagating the carry
- ADDQ (SI), R8
- ADCQ 8(SI), R9
- ADCQ 16(SI), R10
- ADCQ 24(SI), R11
- ADCQ 32(SI), R12
- ADCQ 40(SI), R13
- // | store the sum back into a
- MOVQ a+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // subtraction w/ modular reduction
- // c = (a - b) % p
- //
- // Computes a - b over six limbs; if the subtraction borrows (a < b)
- // the modulus p is added back in, branchlessly via CMOVQCC.
- TEXT ·sub(SB), NOSPLIT, $0-24
- // | load operand pointers; AX := 0 (used by the conditional moves)
- MOVQ a+8(FP), DI
- MOVQ b+16(FP), SI
- XORQ AX, AX
- // | a - b, limb-by-limb with borrow propagation
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- SUBQ (SI), R8
- SBBQ 8(SI), R9
- SBBQ 16(SI), R10
- SBBQ 24(SI), R11
- SBBQ 32(SI), R12
- SBBQ 40(SI), R13
- // | load the modulus limbs; MOVQ leaves the borrow flag intact.
- // | CF clear (no borrow, a >= b): zero them so the fix-up adds 0.
- // | CF set (a < b): keep them so the fix-up adds p.
- MOVQ $0xb9feffffffffaaab, R14
- MOVQ $0x1eabfffeb153ffff, R15
- MOVQ $0x6730d2a0f6b0f624, CX
- MOVQ $0x64774b84f38512bf, DX
- MOVQ $0x4b1ba7b6434bacd7, SI
- MOVQ $0x1a0111ea397fe69a, BX
- CMOVQCC AX, R14
- CMOVQCC AX, R15
- CMOVQCC AX, CX
- CMOVQCC AX, DX
- CMOVQCC AX, SI
- CMOVQCC AX, BX
- ADDQ R14, R8
- ADCQ R15, R9
- ADCQ CX, R10
- ADCQ DX, R11
- ADCQ SI, R12
- ADCQ BX, R13
- // | store the result into c
- MOVQ c+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // subtraction w/ modular reduction
- // a = (a - b) % p
- //
- // In-place variant of sub: if a - b borrows, p is added back
- // branchlessly; the result overwrites a.
- TEXT ·subAssign(SB), NOSPLIT, $0-16
- // | load operand pointers; AX := 0 (used by the conditional moves)
- MOVQ a+0(FP), DI
- MOVQ b+8(FP), SI
- XORQ AX, AX
- // | a - b, limb-by-limb with borrow propagation
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- SUBQ (SI), R8
- SBBQ 8(SI), R9
- SBBQ 16(SI), R10
- SBBQ 24(SI), R11
- SBBQ 32(SI), R12
- SBBQ 40(SI), R13
- // | modulus limbs, zeroed when no borrow occurred (CF clear),
- // | so the fix-up below adds either p or 0.
- MOVQ $0xb9feffffffffaaab, R14
- MOVQ $0x1eabfffeb153ffff, R15
- MOVQ $0x6730d2a0f6b0f624, CX
- MOVQ $0x64774b84f38512bf, DX
- MOVQ $0x4b1ba7b6434bacd7, SI
- MOVQ $0x1a0111ea397fe69a, BX
- CMOVQCC AX, R14
- CMOVQCC AX, R15
- CMOVQCC AX, CX
- CMOVQCC AX, DX
- CMOVQCC AX, SI
- CMOVQCC AX, BX
- ADDQ R14, R8
- ADCQ R15, R9
- ADCQ CX, R10
- ADCQ DX, R11
- ADCQ SI, R12
- ADCQ BX, R13
- // | store the result back into a
- MOVQ a+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // subtraction w/o reduction check
- // a = (a - b)
- //
- // Plain six-limb subtraction; a final borrow is dropped, so callers
- // must know a >= b (or want the wrapped value).
- TEXT ·lsubAssign(SB), NOSPLIT, $0-16
- // | load operand pointers
- MOVQ a+0(FP), DI
- MOVQ b+8(FP), SI
- // | a - b, limb-by-limb with borrow propagation
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- SUBQ (SI), R8
- SBBQ 8(SI), R9
- SBBQ 16(SI), R10
- SBBQ 24(SI), R11
- SBBQ 32(SI), R12
- SBBQ 40(SI), R13
- 
- // | store the (unreduced) difference back into a
- MOVQ a+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // doubling w/ reduction
- // c = (2 * a) % p
- //
- // Doubles a by adding each limb to itself (a left shift through the
- // carry chain), then conditionally subtracts p exactly as ·add does.
- TEXT ·double(SB), NOSPLIT, $0-16
- // | load a and double it in registers
- MOVQ a+8(FP), DI
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- ADDQ R8, R8
- ADCQ R9, R9
- ADCQ R10, R10
- ADCQ R11, R11
- ADCQ R12, R12
- ADCQ R13, R13
- // | compute (2a - p); DI is reused as scratch for the modulus limbs
- MOVQ R8, R14
- MOVQ R9, R15
- MOVQ R10, CX
- MOVQ R11, DX
- MOVQ R12, SI
- MOVQ R13, BX
- MOVQ $0xb9feffffffffaaab, DI
- SUBQ DI, R14
- MOVQ $0x1eabfffeb153ffff, DI
- SBBQ DI, R15
- MOVQ $0x6730d2a0f6b0f624, DI
- SBBQ DI, CX
- MOVQ $0x64774b84f38512bf, DI
- SBBQ DI, DX
- MOVQ $0x4b1ba7b6434bacd7, DI
- SBBQ DI, SI
- MOVQ $0x1a0111ea397fe69a, DI
- SBBQ DI, BX
- // | CF clear => 2a >= p: select the reduced value
- CMOVQCC R14, R8
- CMOVQCC R15, R9
- CMOVQCC CX, R10
- CMOVQCC DX, R11
- CMOVQCC SI, R12
- CMOVQCC BX, R13
- // | store the result into c
- MOVQ c+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // doubling w/ reduction
- // a = (2 * a) % p
- //
- // In-place variant of ·double; DI keeps pointing at a, so AX is used
- // as the scratch register for the modulus limbs instead.
- TEXT ·doubleAssign(SB), NOSPLIT, $0-8
- // | load a and double it in registers
- MOVQ a+0(FP), DI
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- ADDQ R8, R8
- ADCQ R9, R9
- ADCQ R10, R10
- ADCQ R11, R11
- ADCQ R12, R12
- ADCQ R13, R13
- // | compute (2a - p) into R14,R15,CX,DX,SI,BX
- MOVQ R8, R14
- MOVQ R9, R15
- MOVQ R10, CX
- MOVQ R11, DX
- MOVQ R12, SI
- MOVQ R13, BX
- MOVQ $0xb9feffffffffaaab, AX
- SUBQ AX, R14
- MOVQ $0x1eabfffeb153ffff, AX
- SBBQ AX, R15
- MOVQ $0x6730d2a0f6b0f624, AX
- SBBQ AX, CX
- MOVQ $0x64774b84f38512bf, AX
- SBBQ AX, DX
- MOVQ $0x4b1ba7b6434bacd7, AX
- SBBQ AX, SI
- MOVQ $0x1a0111ea397fe69a, AX
- SBBQ AX, BX
- // | CF clear => 2a >= p: select the reduced value, store into a
- CMOVQCC R14, R8
- CMOVQCC R15, R9
- CMOVQCC CX, R10
- CMOVQCC DX, R11
- CMOVQCC SI, R12
- CMOVQCC BX, R13
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // doubling w/o reduction
- // c = 2 * a
- //
- // Plain six-limb doubling; a carry out of the top limb is dropped.
- TEXT ·ldouble(SB), NOSPLIT, $0-16
- // | load the six limbs of a
- MOVQ a+8(FP), DI
- MOVQ (DI), R8
- MOVQ 8(DI), R9
- MOVQ 16(DI), R10
- MOVQ 24(DI), R11
- MOVQ 32(DI), R12
- MOVQ 40(DI), R13
- // | double via self-addition through the carry chain
- ADDQ R8, R8
- ADCQ R9, R9
- ADCQ R10, R10
- ADCQ R11, R11
- ADCQ R12, R12
- ADCQ R13, R13
- // | store the (unreduced) result into c
- MOVQ c+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // negation: c = p - a
- //
- // Unconditionally subtracts a from the modulus limbs.
- // NOTE(review): when a == 0 this yields p, not 0 — callers presumably
- // guard the zero case before calling; confirm on the Go side.
- TEXT ·_neg(SB), NOSPLIT, $0-16
- // | load the pointer to a
- MOVQ a+8(FP), DI
- // | load p into R8..R13 and subtract a limb-by-limb
- MOVQ $0xb9feffffffffaaab, R8
- MOVQ $0x1eabfffeb153ffff, R9
- MOVQ $0x6730d2a0f6b0f624, R10
- MOVQ $0x64774b84f38512bf, R11
- MOVQ $0x4b1ba7b6434bacd7, R12
- MOVQ $0x1a0111ea397fe69a, R13
- SUBQ (DI), R8
- SBBQ 8(DI), R9
- SBBQ 16(DI), R10
- SBBQ 24(DI), R11
- SBBQ 32(DI), R12
- SBBQ 40(DI), R13
- // | store the result into c
- MOVQ c+0(FP), DI
- MOVQ R8, (DI)
- MOVQ R9, 8(DI)
- MOVQ R10, 16(DI)
- MOVQ R11, 24(DI)
- MOVQ R12, 32(DI)
- MOVQ R13, 40(DI)
- RET
- /* | end */
- // multiplication without using MULX/ADX
- // c = a * b % p
- //
- // Montgomery multiplication for six-limb operands, written for CPUs
- // without the BMI2/ADX extensions (cf. ·mulADX): a 6x6 schoolbook
- // multiply via MULQ producing the 12-limb product w0..w11, followed by
- // a six-round Montgomery reduction that folds one low limb per round.
- //
- // ·inp is the per-round Montgomery factor (NOTE(review): presumably
- // -p^-1 mod 2^64 — confirm against the Go-side declaration) and
- // ·modulus holds the six limbs of p. The $24 stack frame spills the
- // three lowest product limbs (w0..w2) while registers are scarce.
- // A final conditional subtraction of p brings the result below p.
- TEXT ·mulNoADX(SB), NOSPLIT, $24-24
- // |
- /* inputs */
- MOVQ a+8(FP), DI
- MOVQ b+16(FP), SI
- MOVQ $0x00, R9
- MOVQ $0x00, R10
- MOVQ $0x00, R11
- MOVQ $0x00, R12
- MOVQ $0x00, R13
- MOVQ $0x00, R14
- MOVQ $0x00, R15
- // |
- /* i0 */
- // | a0 @ CX
- MOVQ (DI), CX
- // | a0 * b0
- MOVQ (SI), AX
- MULQ CX
- MOVQ AX, (SP)
- MOVQ DX, R8
- // | a0 * b1
- MOVQ 8(SI), AX
- MULQ CX
- ADDQ AX, R8
- ADCQ DX, R9
- // | a0 * b2
- MOVQ 16(SI), AX
- MULQ CX
- ADDQ AX, R9
- ADCQ DX, R10
- // | a0 * b3
- MOVQ 24(SI), AX
- MULQ CX
- ADDQ AX, R10
- ADCQ DX, R11
- // | a0 * b4
- MOVQ 32(SI), AX
- MULQ CX
- ADDQ AX, R11
- ADCQ DX, R12
- // | a0 * b5
- MOVQ 40(SI), AX
- MULQ CX
- ADDQ AX, R12
- ADCQ DX, R13
- // |
- /* i1 */
- // | a1 @ CX
- MOVQ 8(DI), CX
- MOVQ $0x00, BX
- // | a1 * b0
- MOVQ (SI), AX
- MULQ CX
- ADDQ AX, R8
- ADCQ DX, R9
- ADCQ $0x00, R10
- ADCQ $0x00, BX
- MOVQ R8, 8(SP)
- MOVQ $0x00, R8
- // | a1 * b1
- MOVQ 8(SI), AX
- MULQ CX
- ADDQ AX, R9
- ADCQ DX, R10
- ADCQ BX, R11
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a1 * b2
- MOVQ 16(SI), AX
- MULQ CX
- ADDQ AX, R10
- ADCQ DX, R11
- ADCQ BX, R12
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a1 * b3
- MOVQ 24(SI), AX
- MULQ CX
- ADDQ AX, R11
- ADCQ DX, R12
- ADCQ BX, R13
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a1 * b4
- MOVQ 32(SI), AX
- MULQ CX
- ADDQ AX, R12
- ADCQ DX, R13
- ADCQ BX, R14
- // | a1 * b5
- MOVQ 40(SI), AX
- MULQ CX
- ADDQ AX, R13
- ADCQ DX, R14
- // |
- /* i2 */
- // | a2 @ CX
- MOVQ 16(DI), CX
- MOVQ $0x00, BX
- // | a2 * b0
- MOVQ (SI), AX
- MULQ CX
- ADDQ AX, R9
- ADCQ DX, R10
- ADCQ $0x00, R11
- ADCQ $0x00, BX
- MOVQ R9, 16(SP)
- MOVQ $0x00, R9
- // | a2 * b1
- MOVQ 8(SI), AX
- MULQ CX
- ADDQ AX, R10
- ADCQ DX, R11
- ADCQ BX, R12
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a2 * b2
- MOVQ 16(SI), AX
- MULQ CX
- ADDQ AX, R11
- ADCQ DX, R12
- ADCQ BX, R13
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a2 * b3
- MOVQ 24(SI), AX
- MULQ CX
- ADDQ AX, R12
- ADCQ DX, R13
- ADCQ BX, R14
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a2 * b4
- MOVQ 32(SI), AX
- MULQ CX
- ADDQ AX, R13
- ADCQ DX, R14
- ADCQ BX, R15
- // | a2 * b5
- MOVQ 40(SI), AX
- MULQ CX
- ADDQ AX, R14
- ADCQ DX, R15
- // |
- /* i3 */
- // | a3 @ CX
- MOVQ 24(DI), CX
- MOVQ $0x00, BX
- // | a3 * b0
- MOVQ (SI), AX
- MULQ CX
- ADDQ AX, R10
- ADCQ DX, R11
- ADCQ $0x00, R12
- ADCQ $0x00, BX
- // | a3 * b1
- MOVQ 8(SI), AX
- MULQ CX
- ADDQ AX, R11
- ADCQ DX, R12
- ADCQ BX, R13
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a3 * b2
- MOVQ 16(SI), AX
- MULQ CX
- ADDQ AX, R12
- ADCQ DX, R13
- ADCQ BX, R14
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a3 * b3
- MOVQ 24(SI), AX
- MULQ CX
- ADDQ AX, R13
- ADCQ DX, R14
- ADCQ BX, R15
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a3 * b4
- MOVQ 32(SI), AX
- MULQ CX
- ADDQ AX, R14
- ADCQ DX, R15
- ADCQ BX, R8
- // | a3 * b5
- MOVQ 40(SI), AX
- MULQ CX
- ADDQ AX, R15
- ADCQ DX, R8
- // |
- /* i4 */
- // | a4 @ CX
- MOVQ 32(DI), CX
- MOVQ $0x00, BX
- // | a4 * b0
- MOVQ (SI), AX
- MULQ CX
- ADDQ AX, R11
- ADCQ DX, R12
- ADCQ $0x00, R13
- ADCQ $0x00, BX
- // | a4 * b1
- MOVQ 8(SI), AX
- MULQ CX
- ADDQ AX, R12
- ADCQ DX, R13
- ADCQ BX, R14
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a4 * b2
- MOVQ 16(SI), AX
- MULQ CX
- ADDQ AX, R13
- ADCQ DX, R14
- ADCQ BX, R15
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a4 * b3
- MOVQ 24(SI), AX
- MULQ CX
- ADDQ AX, R14
- ADCQ DX, R15
- ADCQ BX, R8
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a4 * b4
- MOVQ 32(SI), AX
- MULQ CX
- ADDQ AX, R15
- ADCQ DX, R8
- ADCQ BX, R9
- // | a4 * b5
- MOVQ 40(SI), AX
- MULQ CX
- ADDQ AX, R8
- ADCQ DX, R9
- // |
- /* i5 */
- // | a5 @ CX
- MOVQ 40(DI), CX
- MOVQ $0x00, BX
- // | a5 * b0
- MOVQ (SI), AX
- MULQ CX
- ADDQ AX, R12
- ADCQ DX, R13
- ADCQ $0x00, R14
- ADCQ $0x00, BX
- // | a5 * b1
- MOVQ 8(SI), AX
- MULQ CX
- ADDQ AX, R13
- ADCQ DX, R14
- ADCQ BX, R15
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a5 * b2
- MOVQ 16(SI), AX
- MULQ CX
- ADDQ AX, R14
- ADCQ DX, R15
- ADCQ BX, R8
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a5 * b3
- MOVQ 24(SI), AX
- MULQ CX
- ADDQ AX, R15
- ADCQ DX, R8
- ADCQ BX, R9
- MOVQ $0x00, BX
- ADCQ $0x00, BX
- // | a5 * b4
- MOVQ 32(SI), AX
- MULQ CX
- ADDQ AX, R8
- ADCQ DX, R9
- ADCQ $0x00, BX
- // | a5 * b5
- MOVQ 40(SI), AX
- MULQ CX
- ADDQ AX, R9
- ADCQ DX, BX
- // |
- /* schoolbook multiply done; shuffle limbs for the reduction */
- // |
- // | W
- // | 0 (SP) | 1 8(SP) | 2 16(SP) | 3 R10 | 4 R11 | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 R9 | 11 BX
- MOVQ (SP), CX
- MOVQ 8(SP), DI
- MOVQ 16(SP), SI
- MOVQ BX, (SP)
- MOVQ R9, 8(SP)
- // |
- /* montgomery reduction */
- // |
- /* i0 */
- // |
- // | W
- // | 0 CX | 1 DI | 2 SI | 3 R10 | 4 R11 | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 8(SP) | 11 (SP)
- // | | u0 = w0 * inp
- MOVQ CX, AX
- MULQ ·inp+0(SB)
- MOVQ AX, R9
- MOVQ $0x00, BX
- // |
- /* w0..w6 += u0 * p; this zeroes w0 by construction */
- // | j0
- // | w0 @ CX
- MOVQ ·modulus+0(SB), AX
- MULQ R9
- ADDQ AX, CX
- ADCQ DX, BX
- // | j1
- // | w1 @ DI
- MOVQ ·modulus+8(SB), AX
- MULQ R9
- ADDQ AX, DI
- ADCQ $0x00, DX
- ADDQ BX, DI
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j2
- // | w2 @ SI
- MOVQ ·modulus+16(SB), AX
- MULQ R9
- ADDQ AX, SI
- ADCQ $0x00, DX
- ADDQ BX, SI
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j3
- // | w3 @ R10
- MOVQ ·modulus+24(SB), AX
- MULQ R9
- ADDQ AX, R10
- ADCQ $0x00, DX
- ADDQ BX, R10
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j4
- // | w4 @ R11
- MOVQ ·modulus+32(SB), AX
- MULQ R9
- ADDQ AX, R11
- ADCQ $0x00, DX
- ADDQ BX, R11
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j5
- // | w5 @ R12
- MOVQ ·modulus+40(SB), AX
- MULQ R9
- ADDQ AX, R12
- ADCQ $0x00, DX
- ADDQ BX, R12
- // | w6 @ R13; CX (now free) accumulates the round's carry-out
- ADCQ DX, R13
- ADCQ $0x00, CX
- // |
- /* i1 */
- // |
- // | W
- // | 0 - | 1 DI | 2 SI | 3 R10 | 4 R11 | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 8(SP) | 11 (SP)
- // | | u1 = w1 * inp
- MOVQ DI, AX
- MULQ ·inp+0(SB)
- MOVQ AX, R9
- MOVQ $0x00, BX
- // |
- /* w1..w7 += u1 * p */
- // | j0
- // | w1 @ DI
- MOVQ ·modulus+0(SB), AX
- MULQ R9
- ADDQ AX, DI
- ADCQ DX, BX
- // | j1
- // | w2 @ SI
- MOVQ ·modulus+8(SB), AX
- MULQ R9
- ADDQ AX, SI
- ADCQ $0x00, DX
- ADDQ BX, SI
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j2
- // | w3 @ R10
- MOVQ ·modulus+16(SB), AX
- MULQ R9
- ADDQ AX, R10
- ADCQ $0x00, DX
- ADDQ BX, R10
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j3
- // | w4 @ R11
- MOVQ ·modulus+24(SB), AX
- MULQ R9
- ADDQ AX, R11
- ADCQ $0x00, DX
- ADDQ BX, R11
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j4
- // | w5 @ R12
- MOVQ ·modulus+32(SB), AX
- MULQ R9
- ADDQ AX, R12
- ADCQ $0x00, DX
- ADDQ BX, R12
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j5
- // | w6 @ R13
- MOVQ ·modulus+40(SB), AX
- MULQ R9
- ADDQ AX, R13
- ADCQ DX, CX
- ADDQ BX, R13
- // | w7 @ R14
- ADCQ CX, R14
- MOVQ $0x00, CX
- ADCQ $0x00, CX
- // |
- /* i2 */
- // |
- // | W
- // | 0 - | 1 - | 2 SI | 3 R10 | 4 R11 | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 8(SP) | 11 (SP)
- // | | u2 = w2 * inp
- MOVQ SI, AX
- MULQ ·inp+0(SB)
- MOVQ AX, R9
- MOVQ $0x00, BX
- // |
- /* w2..w8 += u2 * p */
- // | j0
- // | w2 @ SI
- MOVQ ·modulus+0(SB), AX
- MULQ R9
- ADDQ AX, SI
- ADCQ DX, BX
- // | j1
- // | w3 @ R10
- MOVQ ·modulus+8(SB), AX
- MULQ R9
- ADDQ AX, R10
- ADCQ $0x00, DX
- ADDQ BX, R10
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j2
- // | w4 @ R11
- MOVQ ·modulus+16(SB), AX
- MULQ R9
- ADDQ AX, R11
- ADCQ $0x00, DX
- ADDQ BX, R11
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j3
- // | w5 @ R12
- MOVQ ·modulus+24(SB), AX
- MULQ R9
- ADDQ AX, R12
- ADCQ $0x00, DX
- ADDQ BX, R12
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j4
- // | w6 @ R13
- MOVQ ·modulus+32(SB), AX
- MULQ R9
- ADDQ AX, R13
- ADCQ $0x00, DX
- ADDQ BX, R13
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j5
- // | w7 @ R14
- MOVQ ·modulus+40(SB), AX
- MULQ R9
- ADDQ AX, R14
- ADCQ DX, CX
- ADDQ BX, R14
- // | w8 @ R15
- ADCQ CX, R15
- MOVQ $0x00, CX
- ADCQ $0x00, CX
- // |
- /* i3 */
- // |
- // | W
- // | 0 - | 1 - | 2 - | 3 R10 | 4 R11 | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 8(SP) | 11 (SP)
- // | | u3 = w3 * inp
- MOVQ R10, AX
- MULQ ·inp+0(SB)
- MOVQ AX, R9
- MOVQ $0x00, BX
- // |
- /* w3..w9 += u3 * p */
- // | j0
- // | w3 @ R10
- MOVQ ·modulus+0(SB), AX
- MULQ R9
- ADDQ AX, R10
- ADCQ DX, BX
- // | j1
- // | w4 @ R11
- MOVQ ·modulus+8(SB), AX
- MULQ R9
- ADDQ AX, R11
- ADCQ $0x00, DX
- ADDQ BX, R11
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j2
- // | w5 @ R12
- MOVQ ·modulus+16(SB), AX
- MULQ R9
- ADDQ AX, R12
- ADCQ $0x00, DX
- ADDQ BX, R12
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j3
- // | w6 @ R13
- MOVQ ·modulus+24(SB), AX
- MULQ R9
- ADDQ AX, R13
- ADCQ $0x00, DX
- ADDQ BX, R13
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j4
- // | w7 @ R14
- MOVQ ·modulus+32(SB), AX
- MULQ R9
- ADDQ AX, R14
- ADCQ $0x00, DX
- ADDQ BX, R14
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j5
- // | w8 @ R15
- MOVQ ·modulus+40(SB), AX
- MULQ R9
- ADDQ AX, R15
- ADCQ DX, CX
- ADDQ BX, R15
- // | w9 @ R8
- ADCQ CX, R8
- MOVQ $0x00, CX
- ADCQ $0x00, CX
- // |
- /* i4 */
- // |
- // | W
- // | 0 - | 1 - | 2 - | 3 - | 4 R11 | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 8(SP) | 11 (SP)
- // | | u4 = w4 * inp
- MOVQ R11, AX
- MULQ ·inp+0(SB)
- MOVQ AX, R9
- MOVQ $0x00, BX
- // |
- /* w4..w10 += u4 * p */
- // | j0
- // | w4 @ R11
- MOVQ ·modulus+0(SB), AX
- MULQ R9
- ADDQ AX, R11
- ADCQ DX, BX
- // | j1
- // | w5 @ R12
- MOVQ ·modulus+8(SB), AX
- MULQ R9
- ADDQ AX, R12
- ADCQ $0x00, DX
- ADDQ BX, R12
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j2
- // | w6 @ R13
- MOVQ ·modulus+16(SB), AX
- MULQ R9
- ADDQ AX, R13
- ADCQ $0x00, DX
- ADDQ BX, R13
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j3
- // | w7 @ R14
- MOVQ ·modulus+24(SB), AX
- MULQ R9
- ADDQ AX, R14
- ADCQ $0x00, DX
- ADDQ BX, R14
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j4
- // | w8 @ R15
- MOVQ ·modulus+32(SB), AX
- MULQ R9
- ADDQ AX, R15
- ADCQ $0x00, DX
- ADDQ BX, R15
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j5
- // | w9 @ R8
- MOVQ ·modulus+40(SB), AX
- MULQ R9
- ADDQ AX, R8
- ADCQ DX, CX
- ADDQ BX, R8
- // | move to idle register
- MOVQ 8(SP), DI
- // | w10 @ DI
- ADCQ CX, DI
- MOVQ $0x00, CX
- ADCQ $0x00, CX
- // |
- /* i5 */
- // |
- // | W
- // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R12
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 DI | 11 (SP)
- // | | u5 = w5 * inp
- MOVQ R12, AX
- MULQ ·inp+0(SB)
- MOVQ AX, R9
- MOVQ $0x00, BX
- // |
- /* w5..w11 += u5 * p */
- // | j0
- // | w5 @ R12
- MOVQ ·modulus+0(SB), AX
- MULQ R9
- ADDQ AX, R12
- ADCQ DX, BX
- // | j1
- // | w6 @ R13
- MOVQ ·modulus+8(SB), AX
- MULQ R9
- ADDQ AX, R13
- ADCQ $0x00, DX
- ADDQ BX, R13
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j2
- // | w7 @ R14
- MOVQ ·modulus+16(SB), AX
- MULQ R9
- ADDQ AX, R14
- ADCQ $0x00, DX
- ADDQ BX, R14
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j3
- // | w8 @ R15
- MOVQ ·modulus+24(SB), AX
- MULQ R9
- ADDQ AX, R15
- ADCQ $0x00, DX
- ADDQ BX, R15
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j4
- // | w9 @ R8
- MOVQ ·modulus+32(SB), AX
- MULQ R9
- ADDQ AX, R8
- ADCQ $0x00, DX
- ADDQ BX, R8
- MOVQ $0x00, BX
- ADCQ DX, BX
- // | j5
- // | w10 @ DI
- MOVQ ·modulus+40(SB), AX
- MULQ R9
- ADDQ AX, DI
- ADCQ DX, CX
- ADDQ BX, DI
- // | w11 @ CX
- ADCQ (SP), CX
- // |
- // | W montgomerry reduction ends
- // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 -
- // | 6 R13 | 7 R14 | 8 R15 | 9 R8 | 10 DI | 11 CX
- // |
- /* modular reduction: conditionally subtract p from w6..w11 */
- MOVQ R13, R10
- SUBQ ·modulus+0(SB), R10
- MOVQ R14, R11
- SBBQ ·modulus+8(SB), R11
- MOVQ R15, R12
- SBBQ ·modulus+16(SB), R12
- MOVQ R8, AX
- SBBQ ·modulus+24(SB), AX
- MOVQ DI, BX
- SBBQ ·modulus+32(SB), BX
- MOVQ CX, R9
- SBBQ ·modulus+40(SB), R9
- // |
- /* out: CF clear => result >= p, pick the reduced limbs */
- MOVQ c+0(FP), SI
- CMOVQCC R10, R13
- MOVQ R13, (SI)
- CMOVQCC R11, R14
- MOVQ R14, 8(SI)
- CMOVQCC R12, R15
- MOVQ R15, 16(SI)
- CMOVQCC AX, R8
- MOVQ R8, 24(SI)
- CMOVQCC BX, DI
- MOVQ DI, 32(SI)
- CMOVQCC R9, CX
- MOVQ CX, 40(SI)
- RET
- // |
- /* end */
- // multiplication
- // c = a * b % p
- TEXT ·mulADX(SB), NOSPLIT, $16-24
- // |
- /* inputs */
- MOVQ a+8(FP), DI
- MOVQ b+16(FP), SI
- XORQ AX, AX
- // |
- /* i0 */
- // | a0 @ DX
- MOVQ (DI), DX
- // | a0 * b0
- MULXQ (SI), AX, CX
- MOVQ AX, (SP)
- // | a0 * b1
- MULXQ 8(SI), AX, R8
- ADCXQ AX, CX
- // | a0 * b2
- MULXQ 16(SI), AX, R9
- ADCXQ AX, R8
- // | a0 * b3
- MULXQ 24(SI), AX, R10
- ADCXQ AX, R9
- // | a0 * b4
- MULXQ 32(SI), AX, R11
- ADCXQ AX, R10
- // | a0 * b5
- MULXQ 40(SI), AX, R12
- ADCXQ AX, R11
- ADCQ $0x00, R12
- // |
- /* i1 */
- // | a1 @ DX
- MOVQ 8(DI), DX
- XORQ R13, R13
- // | a1 * b0
- MULXQ (SI), AX, BX
- ADOXQ AX, CX
- ADCXQ BX, R8
- MOVQ CX, 8(SP)
- // | a1 * b1
- MULXQ 8(SI), AX, BX
- ADOXQ AX, R8
- ADCXQ BX, R9
- // | a1 * b2
- MULXQ 16(SI), AX, BX
- ADOXQ AX, R9
- ADCXQ BX, R10
- // | a1 * b3
- MULXQ 24(SI), AX, BX
- ADOXQ AX, R10
- ADCXQ BX, R11
- // | a1 * b4
- MULXQ 32(SI), AX, BX
- ADOXQ AX, R11
- ADCXQ BX, R12
- // | a1 * b5
- MULXQ 40(SI), AX, BX
- ADOXQ AX, R12
- ADOXQ R13, R13
- ADCXQ BX, R13
- // |
- /* i2 */
- // | a2 @ DX
- MOVQ 16(DI), DX
- XORQ R14, R14
- // | a2 * b0
- MULXQ (SI), AX, BX
- ADOXQ AX, R8
- ADCXQ BX, R9
- // | a2 * b1
- MULXQ 8(SI), AX, BX
- ADOXQ AX, R9
- ADCXQ BX, R10
- // | a2 * b2
- MULXQ 16(SI), AX, BX
- ADOXQ AX, R10
- ADCXQ BX, R11
- // | a2 * b3
- MULXQ 24(SI), AX, BX
- ADOXQ AX, R11
- ADCXQ BX, R12
- // | a2 * b4
- MULXQ 32(SI), AX, BX
- ADOXQ AX, R12
- ADCXQ BX, R13
- // | a2 * b5
- MULXQ 40(SI), AX, BX
- ADOXQ AX, R13
- ADOXQ R14, R14
- ADCXQ BX, R14
- // |
- /* i3 */
- // | a3 @ DX
- MOVQ 24(DI), DX
- XORQ R15, R15
- // | a3 * b0
- MULXQ (SI), AX, BX
- ADOXQ AX, R9
- ADCXQ BX, R10
- // | a3 * b1
- MULXQ 8(SI), AX, BX
- ADOXQ AX, R10
- ADCXQ BX, R11
- // | a3 * b2
- MULXQ 16(SI), AX, BX
- ADOXQ AX, R11
- ADCXQ BX, R12
- // | a3 * b3
- MULXQ 24(SI), AX, BX
- ADOXQ AX, R12
- ADCXQ BX, R13
- // | a3 * b4
- MULXQ 32(SI), AX, BX
- ADOXQ AX, R13
- ADCXQ BX, R14
- // | a3 * b5
- MULXQ 40(SI), AX, BX
- ADOXQ AX, R14
- ADOXQ R15, R15
- ADCXQ BX, R15
- // |
- /* i4 */
- // | a4 @ DX
- MOVQ 32(DI), DX
- XORQ CX, CX
- // | a4 * b0
- MULXQ (SI), AX, BX
- ADOXQ AX, R10
- ADCXQ BX, R11
- // | a4 * b1
- MULXQ 8(SI), AX, BX
- ADOXQ AX, R11
- ADCXQ BX, R12
- // | a4 * b2
- MULXQ 16(SI), AX, BX
- ADOXQ AX, R12
- ADCXQ BX, R13
- // | a4 * b3
- MULXQ 24(SI), AX, BX
- ADOXQ AX, R13
- ADCXQ BX, R14
- // | a4 * b4
- MULXQ 32(SI), AX, BX
- ADOXQ AX, R14
- ADCXQ BX, R15
- // | a4 * b5
- MULXQ 40(SI), AX, BX
- ADOXQ AX, R15
- ADOXQ CX, CX
- ADCXQ BX, CX
- // |
- /* i5 */
- // | a5 @ DX
- MOVQ 40(DI), DX
- XORQ DI, DI
- // | a5 * b0
- MULXQ (SI), AX, BX
- ADOXQ AX, R11
- ADCXQ BX, R12
- // | a5 * b1
- MULXQ 8(SI), AX, BX
- ADOXQ AX, R12
- ADCXQ BX, R13
- // | a5 * b2
- MULXQ 16(SI), AX, BX
- ADOXQ AX, R13
- ADCXQ BX, R14
- // | a5 * b3
- MULXQ 24(SI), AX, BX
- ADOXQ AX, R14
- ADCXQ BX, R15
- // | a5 * b4
- MULXQ 32(SI), AX, BX
- ADOXQ AX, R15
- ADCXQ BX, CX
- // | a5 * b5
- MULXQ 40(SI), AX, BX
- ADOXQ AX, CX
- ADOXQ BX, DI
- ADCQ $0x00, DI
- // |
- /* */
- // |
- // | W
- // | 0 (SP) | 1 8(SP) | 2 R8 | 3 R9 | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 DI
- MOVQ (SP), BX
- MOVQ 8(SP), SI
- MOVQ DI, (SP)
- // |
- // | W ready to mont
- // | 0 BX | 1 SI | 2 R8 | 3 R9 | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // |
- /* montgomery reduction */
- // | clear flags
- XORQ AX, AX
- // |
- /* i0 */
- // |
- // | W
- // | 0 BX | 1 SI | 2 R8 | 3 R9 | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // | | u0 = w0 * inp
- MOVQ BX, DX
- MULXQ ·inp+0(SB), DX, DI
- // |
- /* */
- // | j0
- // | w0 @ BX
- MULXQ ·modulus+0(SB), AX, DI
- ADOXQ AX, BX
- ADCXQ DI, SI
- // | j1
- // | w1 @ SI
- MULXQ ·modulus+8(SB), AX, DI
- ADOXQ AX, SI
- ADCXQ DI, R8
- // | j2
- // | w2 @ R8
- MULXQ ·modulus+16(SB), AX, DI
- ADOXQ AX, R8
- ADCXQ DI, R9
- // | j3
- // | w3 @ R9
- MULXQ ·modulus+24(SB), AX, DI
- ADOXQ AX, R9
- ADCXQ DI, R10
- // | j4
- // | w4 @ R10
- MULXQ ·modulus+32(SB), AX, DI
- ADOXQ AX, R10
- ADCXQ DI, R11
- // | j5
- // | w5 @ R11
- MULXQ ·modulus+40(SB), AX, DI
- ADOXQ AX, R11
- ADCXQ DI, R12
- ADOXQ BX, R12
- ADCXQ BX, BX
- MOVQ $0x00, AX
- ADOXQ AX, BX
- // | clear flags
- XORQ AX, AX
- // |
- /* i1 */
- // |
- // | W
- // | 0 - | 1 SI | 2 R8 | 3 R9 | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // | | u1 = w1 * inp
- MOVQ SI, DX
- MULXQ ·inp+0(SB), DX, DI
- // |
- /* */
- // | j0
- // | w1 @ SI
- MULXQ ·modulus+0(SB), AX, DI
- ADOXQ AX, SI
- ADCXQ DI, R8
- // | j1
- // | w2 @ R8
- MULXQ ·modulus+8(SB), AX, DI
- ADOXQ AX, R8
- ADCXQ DI, R9
- // | j2
- // | w3 @ R9
- MULXQ ·modulus+16(SB), AX, DI
- ADOXQ AX, R9
- ADCXQ DI, R10
- // | j3
- // | w4 @ R10
- MULXQ ·modulus+24(SB), AX, DI
- ADOXQ AX, R10
- ADCXQ DI, R11
- // | j4
- // | w5 @ R11
- MULXQ ·modulus+32(SB), AX, DI
- ADOXQ AX, R11
- ADCXQ DI, R12
- // | j5
- // | w6 @ R12
- MULXQ ·modulus+40(SB), AX, DI
- ADOXQ AX, R12
- ADCXQ DI, R13
- ADOXQ BX, R13
- ADCXQ SI, SI
- MOVQ $0x00, AX
- ADOXQ AX, SI
- // | clear flags
- XORQ AX, AX
- // |
- /* i2 */
- // |
- // | W
- // | 0 - | 1 - | 2 R8 | 3 R9 | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // | | u2 = w2 * inp
- MOVQ R8, DX
- MULXQ ·inp+0(SB), DX, DI
- // |
- /* */
- // | j0
- // | w2 @ R8
- MULXQ ·modulus+0(SB), AX, DI
- ADOXQ AX, R8
- ADCXQ DI, R9
- // | j1
- // | w3 @ R9
- MULXQ ·modulus+8(SB), AX, DI
- ADOXQ AX, R9
- ADCXQ DI, R10
- // | j2
- // | w4 @ R10
- MULXQ ·modulus+16(SB), AX, DI
- ADOXQ AX, R10
- ADCXQ DI, R11
- // | j3
- // | w5 @ R11
- MULXQ ·modulus+24(SB), AX, DI
- ADOXQ AX, R11
- ADCXQ DI, R12
- // | j4
- // | w6 @ R12
- MULXQ ·modulus+32(SB), AX, DI
- ADOXQ AX, R12
- ADCXQ DI, R13
- // | j5
- // | w7 @ R13
- MULXQ ·modulus+40(SB), AX, DI
- ADOXQ AX, R13
- ADCXQ DI, R14
- ADOXQ SI, R14
- ADCXQ R8, R8
- MOVQ $0x00, AX
- ADOXQ AX, R8
- // | clear flags
- XORQ AX, AX
- // |
- /* i3 */
- // |
- // | W
- // | 0 - | 1 - | 2 - | 3 R9 | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // | | u3 = w3 * inp
- MOVQ R9, DX
- MULXQ ·inp+0(SB), DX, DI
- // |
- /* */
- // | j0
- // | w3 @ R9
- MULXQ ·modulus+0(SB), AX, DI
- ADOXQ AX, R9
- ADCXQ DI, R10
- // | j1
- // | w4 @ R10
- MULXQ ·modulus+8(SB), AX, DI
- ADOXQ AX, R10
- ADCXQ DI, R11
- // | j2
- // | w5 @ R11
- MULXQ ·modulus+16(SB), AX, DI
- ADOXQ AX, R11
- ADCXQ DI, R12
- // | j3
- // | w6 @ R12
- MULXQ ·modulus+24(SB), AX, DI
- ADOXQ AX, R12
- ADCXQ DI, R13
- // | j4
- // | w7 @ R13
- MULXQ ·modulus+32(SB), AX, DI
- ADOXQ AX, R13
- ADCXQ DI, R14
- // | j5
- // | w8 @ R14
- MULXQ ·modulus+40(SB), AX, DI
- ADOXQ AX, R14
- ADCXQ DI, R15
- ADOXQ R8, R15
- ADCXQ R9, R9
- MOVQ $0x00, AX
- ADOXQ AX, R9
- // | clear flags
- XORQ AX, AX
- // |
- /* i4 */
- // |
- // | W
- // | 0 - | 1 - | 2 - | 3 - | 4 R10 | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // | | u4 = w4 * inp
- MOVQ R10, DX
- MULXQ ·inp+0(SB), DX, DI
- // |
- /* */
- // | j0
- // | w4 @ R10
- MULXQ ·modulus+0(SB), AX, DI
- ADOXQ AX, R10
- ADCXQ DI, R11
- // | j1
- // | w5 @ R11
- MULXQ ·modulus+8(SB), AX, DI
- ADOXQ AX, R11
- ADCXQ DI, R12
- // | j2
- // | w6 @ R12
- MULXQ ·modulus+16(SB), AX, DI
- ADOXQ AX, R12
- ADCXQ DI, R13
- // | j3
- // | w7 @ R13
- MULXQ ·modulus+24(SB), AX, DI
- ADOXQ AX, R13
- ADCXQ DI, R14
- // | j4
- // | w8 @ R14
- MULXQ ·modulus+32(SB), AX, DI
- ADOXQ AX, R14
- ADCXQ DI, R15
- // | j5
- // | w9 @ R15
- MULXQ ·modulus+40(SB), AX, DI
- ADOXQ AX, R15
- ADCXQ DI, CX
- ADOXQ R9, CX
- ADCXQ R10, R10
- MOVQ $0x00, AX
- ADOXQ AX, R10
- // | clear flags
- XORQ AX, AX
- // |
- /* i5 */
- // |
- // | W
- // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 R11
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 (SP)
- // | | u5 = w5 * inp
- MOVQ R11, DX
- MULXQ ·inp+0(SB), DX, DI
- // |
- /* */
- // | j0
- // | w5 @ R11
- MULXQ ·modulus+0(SB), AX, DI
- ADOXQ AX, R11
- ADCXQ DI, R12
- // | j1
- // | w6 @ R12
- MULXQ ·modulus+8(SB), AX, DI
- ADOXQ AX, R12
- ADCXQ DI, R13
- // | j2
- // | w7 @ R13
- MULXQ ·modulus+16(SB), AX, DI
- ADOXQ AX, R13
- ADCXQ DI, R14
- // | j3
- // | w8 @ R14
- MULXQ ·modulus+24(SB), AX, DI
- ADOXQ AX, R14
- ADCXQ DI, R15
- // | j4
- // | w9 @ R15
- MULXQ ·modulus+32(SB), AX, DI
- ADOXQ AX, R15
- ADCXQ DI, CX
- // | j5
- // | w10 @ CX
- MULXQ ·modulus+40(SB), AX, DI
- ADOXQ AX, CX
- // | w11 @ (SP)
- // | move to an idle register
- MOVQ (SP), BX
- ADCXQ DI, BX
- ADOXQ R10, BX
- // |
- // | W montgomery reduction ends
- // | 0 - | 1 - | 2 - | 3 - | 4 - | 5 -
- // | 6 R12 | 7 R13 | 8 R14 | 9 R15 | 10 CX | 11 BX
- // |
- /* modular reduction */
- MOVQ R12, AX
- SUBQ ·modulus+0(SB), AX
- MOVQ R13, DI
- SBBQ ·modulus+8(SB), DI
- MOVQ R14, SI
- SBBQ ·modulus+16(SB), SI
- MOVQ R15, R8
- SBBQ ·modulus+24(SB), R8
- MOVQ CX, R9
- SBBQ ·modulus+32(SB), R9
- MOVQ BX, R10
- SBBQ ·modulus+40(SB), R10
- // |
- /* out */
- MOVQ c+0(FP), R11
- CMOVQCC AX, R12
- MOVQ R12, (R11)
- CMOVQCC DI, R13
- MOVQ R13, 8(R11)
- CMOVQCC SI, R14
- MOVQ R14, 16(R11)
- CMOVQCC R8, R15
- MOVQ R15, 24(R11)
- CMOVQCC R9, CX
- MOVQ CX, 32(R11)
- CMOVQCC R10, BX
- MOVQ BX, 40(R11)
- RET
- // |
- /* end */
|