mul_arm64.h 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  // mul(c0,...,c7): 4x4-limb schoolbook multiplication.
  //
  // Multiplies the four 64-bit limbs in R1:R2:R3:R4 by the four 64-bit limbs
  // in R5:R6:R7:R8 and leaves the 8-limb (512-bit) product in the registers
  // passed as c0..c7. R1, R5 and c0 are the least significant limbs (the low
  // half of R1*R5 lands in c0).
  //
  // Operand order is Go arm64 syntax, destination last: "MUL Rm, Rn, Rd" and
  // "UMULH Rm, Rn, Rd" write the low/high 64 bits of Rn*Rm to Rd; the
  // two-operand "ADDS R, Rd" / "ADCS R, Rd" add R (plus carry for ADCS) into Rd.
  //
  // Registers written besides c0..c7: R0 (scratch for low product halves),
  // R1 (the input limb is overwritten from row 2 onwards), R26, R27, R29, and
  // the condition flags. R2..R8 are only read. The c0..c7 arguments should
  // therefore not alias R0..R8, R26, R27 or R29.
  1. #define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
  2. MUL R1, R5, c0 \ // Row 1: c0:c1:c2:c3:c4 = R1 * (R5:R6:R7:R8)
  3. UMULH R1, R5, c1 \
  4. MUL R1, R6, R0 \
  5. ADDS R0, c1 \
  6. UMULH R1, R6, c2 \
  7. MUL R1, R7, R0 \
  8. ADCS R0, c2 \
  9. UMULH R1, R7, c3 \
  10. MUL R1, R8, R0 \
  11. ADCS R0, c3 \
  12. UMULH R1, R8, c4 \
  13. ADCS ZR, c4 \ // fold the last carry of the row into its top limb
  14. \ // Row 2: R1:R26:R27:R29:c5 = R2 * (R5:R6:R7:R8); R1 is reused as scratch from here on
  15. MUL R2, R5, R1 \
  16. UMULH R2, R5, R26 \
  17. MUL R2, R6, R0 \
  18. ADDS R0, R26 \
  19. UMULH R2, R6, R27 \
  20. MUL R2, R7, R0 \
  21. ADCS R0, R27 \
  22. UMULH R2, R7, R29 \
  23. MUL R2, R8, R0 \
  24. ADCS R0, R29 \
  25. UMULH R2, R8, c5 \
  26. ADCS ZR, c5 \
  27. ADDS R1, c1 \ // accumulate row 2 into c1..c5 (one limb above row 1)
  28. ADCS R26, c2 \
  29. ADCS R27, c3 \
  30. ADCS R29, c4 \
  31. ADCS ZR, c5 \
  32. \ // Row 3: R1:R26:R27:R29:c6 = R3 * (R5:R6:R7:R8)
  33. MUL R3, R5, R1 \
  34. UMULH R3, R5, R26 \
  35. MUL R3, R6, R0 \
  36. ADDS R0, R26 \
  37. UMULH R3, R6, R27 \
  38. MUL R3, R7, R0 \
  39. ADCS R0, R27 \
  40. UMULH R3, R7, R29 \
  41. MUL R3, R8, R0 \
  42. ADCS R0, R29 \
  43. UMULH R3, R8, c6 \
  44. ADCS ZR, c6 \
  45. ADDS R1, c2 \ // accumulate row 3 into c2..c6
  46. ADCS R26, c3 \
  47. ADCS R27, c4 \
  48. ADCS R29, c5 \
  49. ADCS ZR, c6 \
  50. \ // Row 4: R1:R26:R27:R29:c7 = R4 * (R5:R6:R7:R8)
  51. MUL R4, R5, R1 \
  52. UMULH R4, R5, R26 \
  53. MUL R4, R6, R0 \
  54. ADDS R0, R26 \
  55. UMULH R4, R6, R27 \
  56. MUL R4, R7, R0 \
  57. ADCS R0, R27 \
  58. UMULH R4, R7, R29 \
  59. MUL R4, R8, R0 \
  60. ADCS R0, R29 \
  61. UMULH R4, R8, c7 \
  62. ADCS ZR, c7 \
  63. ADDS R1, c3 \ // accumulate row 4 into c3..c7
  64. ADCS R26, c4 \
  65. ADCS R27, c5 \
  66. ADCS R29, c6 \
  67. ADCS ZR, c7
  // gfpReduce(): reduces the 512-bit value T held in R9..R16 (R9 is the least
  // significant limb) and leaves a 4-limb result in R1:R2:R3:R4.
  //
  // Steps, as performed below:
  //   1. m = low 256 bits of (R9:R10:R11:R12) * np, where np is the four-limb
  //      constant loaded from the ·np symbol; m ends up in R1:R2:R3:R4.
  //   2. R17..R24 = m * (R5:R6:R7:R8) via the mul macro, after loadModulus
  //      (defined elsewhere; presumably fills R5..R8 with the modulus; confirm).
  //   3. R17..R24 += T; the upper half R21:R22:R23:R24 is taken as the result.
  //   4. One conditional subtraction of R5:R6:R7:R8 from that result.
  // The step comments ("m = (T * N') mod R") suggest this is a Montgomery
  // reduction with R = 2^256; that reading is not verifiable from this file alone.
  //
  // Registers written directly here: R0, R1..R4, R10..R13, R17, R19..R25, the
  // condition flags, plus whatever mul (R0, R1, R26, R27, R29) and loadModulus
  // write. R9 and R14..R16 are only read.
  68. #define gfpReduce() \
  69. \ // m = (T * N') mod R, store m in R1:R2:R3:R4
  70. MOVD ·np+0(SB), R17 \
  71. MOVD ·np+8(SB), R25 \
  72. MOVD ·np+16(SB), R19 \
  73. MOVD ·np+24(SB), R20 \
  74. \ // Row 1: R1:R2:R3:R4 = low 256 bits of R9 * (R17:R25:R19:R20)
  75. MUL R9, R17, R1 \
  76. UMULH R9, R17, R2 \
  77. MUL R9, R25, R0 \
  78. ADDS R0, R2 \
  79. UMULH R9, R25, R3 \
  80. MUL R9, R19, R0 \
  81. ADCS R0, R3 \
  82. UMULH R9, R19, R4 \
  83. MUL R9, R20, R0 \
  84. ADCS R0, R4 \
  85. \ // Row 2: add the low 192 bits of R10 * np (built in R21:R22:R23) into R2:R3:R4
  86. MUL R10, R17, R21 \
  87. UMULH R10, R17, R22 \
  88. MUL R10, R25, R0 \
  89. ADDS R0, R22 \
  90. UMULH R10, R25, R23 \
  91. MUL R10, R19, R0 \
  92. ADCS R0, R23 \
  93. ADDS R21, R2 \
  94. ADCS R22, R3 \
  95. ADCS R23, R4 \
  96. \ // Row 3: add the low 128 bits of R11 * np (built in R21:R22) into R3:R4
  97. MUL R11, R17, R21 \
  98. UMULH R11, R17, R22 \
  99. MUL R11, R25, R0 \
  100. ADDS R0, R22 \
  101. ADDS R21, R3 \
  102. ADCS R22, R4 \
  103. \ // Row 4: add the low 64 bits of R12 * R17 into R4; higher limbs fall outside the 256-bit result
  104. MUL R12, R17, R21 \
  105. ADDS R21, R4 \
  106. \
  107. \ // m * N
  108. loadModulus(R5,R6,R7,R8) \ // defined elsewhere; presumably loads the modulus limbs into R5..R8 (confirm)
  109. mul(R17,R25,R19,R20,R21,R22,R23,R24) \ // R17:R25:R19:R20:R21:R22:R23:R24 = (R1..R4) * (R5..R8)
  110. \
  111. \ // Add the 512-bit intermediate to m*N
  112. MOVD ZR, R0 \ // R0 = 0; receives the carry out of the addition below
  113. ADDS R9, R17 \
  114. ADCS R10, R25 \
  115. ADCS R11, R19 \
  116. ADCS R12, R20 \
  117. ADCS R13, R21 \
  118. ADCS R14, R22 \
  119. ADCS R15, R23 \
  120. ADCS R16, R24 \
  121. ADCS ZR, R0 \ // NOTE(review): this carry in R0 is not read again within this macro; confirm it can never be set
  122. \
  123. \ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
  124. SUBS R5, R21, R10 \ // R10:R11:R12:R13 = (R21:R22:R23:R24) - (R5:R6:R7:R8)
  125. SBCS R6, R22, R11 \
  126. SBCS R7, R23, R12 \
  127. SBCS R8, R24, R13 \
  128. \ // Carry set means the subtraction did not borrow: keep the difference, otherwise keep the unsubtracted value
  129. CSEL CS, R10, R21, R1 \
  130. CSEL CS, R11, R22, R2 \
  131. CSEL CS, R12, R23, R3 \
  132. CSEL CS, R13, R24, R4