Floating Point Operations
PhD Student
Fall 2017
ARMv8 has 32 floating point registers (sort of, they're actually 128bit, but ignore that for now)
An example function...
double approximate(double x, double y, int iterations) {
// d0 contains x
// d1 contains y
// w0 contains iterations
// do some calculations
// return value goes in d0
}
double approximate(int iterations, double x, double y) {
// d0 contains x
// d1 contains y
// w0 contains iterations
// do some calculations
// return value goes in d0
}
An example function...
int approximate(double x, double y, int iterations) {
// d0 contains x
// d1 contains y
// w0 contains iterations
// do some calculations
// return value goes in w0
}
int approximate(int iterations, double x, double y) {
// d0 contains x
// d1 contains y
// w0 contains iterations
// do some calculations
// return value goes in w0
}
An example...
Recall the taylor series of an infinitely differentiable function (i.e. holomorphic or analytic function)
Or its Maclaurin expansion
For \(e^x\) we have
We can stop after some amount of terms in the sum
In a machine, eventually we start to lose precision, so we stop after the terms stop decreasing the approximation beyond a threshold
#include <math.h>
#include <stdio.h>
double taylor_exp(double x) {
double n = 0;
double nfact = 1;
double exp, exp_old;
double xx = 1;
exp = 1.0;
do {
n += 1.0;
nfact *= n;
xx *= x;
exp_old = exp;
exp += xx/nfact;
} while(fabs(exp_old - exp) > 1e-10);
return exp;
}
int main(int argc, char *argv[]) {
printf("%f\n", taylor_exp(0.0));
printf("%f\n", taylor_exp(1.0));
printf("%f\n", taylor_exp(2.0));
}
.data
fmt1: .string "%f\n"
threshold: .double 0r1e-10 // A double prec. constant
.text
.balign 4
.global main
main:
stp x29, x30, [sp, -16]!
mov x29, sp
fmov d0, xzr
bl taylor_exp
adrp x0, fmt1
add x0, x0, :lo12:fmt1
bl printf
fmov d0, 1.0
bl taylor_exp
adrp x0, fmt1
add x0, x0, :lo12:fmt1
bl printf
fmov d0,2.0
bl taylor_exp
adrp x0, fmt1
add x0, x0, :lo12:fmt1
bl printf
ldp x29, x30, [sp], 16
ret
taylor_exp: // d0 is the first argument
n_s = 16
nfact_s = 24
exp_s = 32
exp_old_s = 40
xx_s = 48
stp x29, x30, [sp, -64]! // Standard stack setup/teardown
mov x29, sp
fmov d1, xzr // Convert 0 to a double
str d1, [x29, n_s] // n = 0.0
fmov d1, 1.0
str d1, [x29, nfact_s] // nfact = 1.0
str d1, [x29, xx_s] // xx = 1.0
str d1, [x29, exp_s] // exp = 1.0
exp_do_top:
ldr d1, [x29, n_s] // loading and storing to addrs works just as it does for x
fmov d2, 1.0
fadd d1, d1, d2 // d1 = d1 + 1
str d1, [x29, n_s] // n = d1
// Build up the factorial over each iteration of the loop
ldr d2, [x29, nfact_s]
fmul d2, d2, d1
str d2, [x29, nfact_s] // nfact = nfact * n = n * (n-1)!
ldr d3, [x29, xx_s]
fmul d3, d0, d3
str d3, [x29, xx_s] // xx = x*xx = x^n
ldr d4, [x29, exp_s]
str d4, [x29, exp_old_s] // exp_old = exp
fdiv d2, d3, d2 // d2 = x^n / nfact
fadd d5, d4, d2 // d5 = exp + x^n/nfact
str d5, [x29, exp_s] // exp = d5
ldr d1, [x29, exp_s]
ldr d2, [x29, exp_old_s]
fsub d1, d1, d2 // d1 = exp - exp_old
fabs d1, d1 // d1 = fabs(exp - exp_old)
adrp x0, threshold // Load the addr of the threshold
add x0, x0, :lo12: threshold
ldr d2, [x0] // load threshold value into d2
fcmp d1, d2 // fabs(exp - exp_old) cmp threshold
b.gt exp_do_top // if it still contributes to the upper 10 digits, keep going
ldr d0, [x29, exp_s] // load ~exp(x) into d0 (return value)
ldp x29, x30, [sp], 64 // stack teardown
ret
Work Day