// mul_amd64.h
//
// Go assembler (amd64) macros: mul, a schoolbook multiplication of two
// four-word operands, and gfpReduce, which reduces the resulting 512-bit value.
  // mulRow multiplies the single 64-bit word a by the four 64-bit words at
  // 0+rb, 8+rb, 16+rb and 24+rb, leaving the five-word product in
  // R8, R9, R10, R11, R12 (R8 is the least significant word).
  //
  // Private helper of mul. a is re-read before every MULQ, so it must not be
  // AX, DX or R8-R11. Clobbers AX, DX and the flags.
  #define mulRow(a, rb) \
      MOVQ a, AX \
      MULQ 0+rb \
      MOVQ AX, R8 \
      MOVQ DX, R9 \
      \ // Each following step adds the low half of the new partial product to the
      \ // previous high half and folds the carry into DX. That ADCQ cannot wrap:
      \ // the high half of a 64x64-bit product is at most 2^64-2.
      MOVQ a, AX \
      MULQ 8+rb \
      ADDQ AX, R9 \
      ADCQ $0, DX \
      MOVQ DX, R10 \
      MOVQ a, AX \
      MULQ 16+rb \
      ADDQ AX, R10 \
      ADCQ $0, DX \
      MOVQ DX, R11 \
      MOVQ a, AX \
      MULQ 24+rb \
      ADDQ AX, R11 \
      ADCQ $0, DX \
      MOVQ DX, R12

  // mul computes the product of the four-word operand a0,a1,a2,a3 (a0 is the
  // least significant word) and the four words at 0+rb .. 24+rb, and writes the
  // eight-word result to 0+stack .. 56+stack, least significant word first.
  //
  // The product is built one row at a time: row i is a_i times the rb operand,
  // added to the partial result already held at 8*i+stack and written back.
  //
  // Constraints visible from the code:
  //   - a0..a3 and rb are re-read after stack has been written, so they must not
  //     overlap the 64-byte output area and must not be AX, DX or R8-R12.
  //   - Clobbers AX, DX, R8-R12 and the flags.
  //   - storeBlock is defined elsewhere; it is assumed to store its four register
  //     arguments to four consecutive quadwords at the given address (TODO confirm).
  #define mul(a0,a1,a2,a3, rb, stack) \
      \ // Row 0: nothing to accumulate yet, store the five words directly.
      mulRow(a0, rb) \
      storeBlock(R8,R9,R10,R11, 0+stack) \
      MOVQ R12, 32+stack \
      \
      \ // Row 1: add to words 1..4 of the partial result; the carry goes into R12.
      mulRow(a1, rb) \
      ADDQ 8+stack, R8 \
      ADCQ 16+stack, R9 \
      ADCQ 24+stack, R10 \
      ADCQ 32+stack, R11 \
      ADCQ $0, R12 \
      storeBlock(R8,R9,R10,R11, 8+stack) \
      MOVQ R12, 40+stack \
      \
      \ // Row 2: add to words 2..5.
      mulRow(a2, rb) \
      ADDQ 16+stack, R8 \
      ADCQ 24+stack, R9 \
      ADCQ 32+stack, R10 \
      ADCQ 40+stack, R11 \
      ADCQ $0, R12 \
      storeBlock(R8,R9,R10,R11, 16+stack) \
      MOVQ R12, 48+stack \
      \
      \ // Row 3: add to words 3..6; R12 becomes the top word of the product.
      mulRow(a3, rb) \
      ADDQ 24+stack, R8 \
      ADCQ 32+stack, R9 \
      ADCQ 40+stack, R10 \
      ADCQ 48+stack, R11 \
      ADCQ $0, R12 \
      storeBlock(R8,R9,R10,R11, 24+stack) \
      MOVQ R12, 56+stack
  // gfpReduce reduces the eight-word value T held at 0+stack .. 56+stack.
  //
  // Steps, as written below:
  //   1. m = (T * N') mod R, keeping only the low four words of the product of
  //      the words at ·np and the low four words of T. m is stored at 64+stack.
  //   2. m * N via mul, with N read from ·p2; the eight-word result is written
  //      to 96+stack .. 152+stack.
  //   3. T + m*N as a nine-word sum in R8..R15 plus AX for the final carry.
  //   4. gfpCarry is invoked with the upper five words (R12..R15, AX).
  //
  // The caller must therefore provide at least 160 bytes at stack.
  // Clobbers AX, DX, R8-R15 and the flags directly; BX is handed to gfpCarry.
  // NOTE(review): this has the shape of a Montgomery reduction with R = 2^256;
  // storeBlock, loadBlock and gfpCarry are defined elsewhere - confirm their
  // contracts (in particular where gfpCarry leaves the result) against them.
  #define gfpReduce(stack) \
      \ // m = (T * N') mod R, store m in R8:R9:R10:R11
      \ // Partial products with N'[0]: all four words of T; the high half of the
      \ // last product lies above 2^256 and is dropped.
      MOVQ ·np+0(SB), AX \
      MULQ 0+stack \
      MOVQ AX, R8 \
      MOVQ DX, R9 \
      MOVQ ·np+0(SB), AX \
      MULQ 8+stack \
      ADDQ AX, R9 \
      ADCQ $0, DX \
      MOVQ DX, R10 \
      MOVQ ·np+0(SB), AX \
      MULQ 16+stack \
      ADDQ AX, R10 \
      ADCQ $0, DX \
      MOVQ DX, R11 \
      MOVQ ·np+0(SB), AX \
      MULQ 24+stack \
      ADDQ AX, R11 \
      \
      \ // Partial products with N'[1]: only three words of T still contribute.
      \ // Built in R12:R13:R14, then added one word up, into R9:R10:R11.
      MOVQ ·np+8(SB), AX \
      MULQ 0+stack \
      MOVQ AX, R12 \
      MOVQ DX, R13 \
      MOVQ ·np+8(SB), AX \
      MULQ 8+stack \
      ADDQ AX, R13 \
      ADCQ $0, DX \
      MOVQ DX, R14 \
      MOVQ ·np+8(SB), AX \
      MULQ 16+stack \
      ADDQ AX, R14 \
      \
      ADDQ R12, R9 \
      ADCQ R13, R10 \
      ADCQ R14, R11 \
      \
      \ // Partial products with N'[2]: two words of T, added into R10:R11.
      MOVQ ·np+16(SB), AX \
      MULQ 0+stack \
      MOVQ AX, R12 \
      MOVQ DX, R13 \
      MOVQ ·np+16(SB), AX \
      MULQ 8+stack \
      ADDQ AX, R13 \
      \
      ADDQ R12, R10 \
      ADCQ R13, R11 \
      \
      \ // Partial product with N'[3]: only its low word reaches R11.
      MOVQ ·np+24(SB), AX \
      MULQ 0+stack \
      ADDQ AX, R11 \
      \
      \ // Park m at 64+stack so that mul can read it as its memory operand.
      storeBlock(R8,R9,R10,R11, 64+stack) \
      \
      \ // m * N
      mul(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64+stack, 96+stack) \
      \
      \ // Add the 512-bit intermediate to m*N
      \ // m*N is loaded into R8..R15 and T is added from memory; AX is zeroed
      \ // with MOVQ before the chain starts and collects the final carry.
      loadBlock(96+stack, R8,R9,R10,R11) \
      loadBlock(128+stack, R12,R13,R14,R15) \
      \
      MOVQ $0, AX \
      ADDQ 0+stack, R8 \
      ADCQ 8+stack, R9 \
      ADCQ 16+stack, R10 \
      ADCQ 24+stack, R11 \
      ADCQ 32+stack, R12 \
      ADCQ 40+stack, R13 \
      ADCQ 48+stack, R14 \
      ADCQ 56+stack, R15 \
      ADCQ $0, AX \
      \
      \ // Hand the upper five words to gfpCarry; R8..R11 and BX are passed as its
      \ // remaining arguments (presumably scratch - verify against gfpCarry).
      gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)