source/luametatex/source/libraries/libcerf/w_of_z.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393

/* Library libcerf:
 *   Compute complex error functions, based on a new implementation of
 *   Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
 *
 * File w_of_z.c:
 *   Computation of Faddeeva's complex scaled error function,
 *      w(z) = exp(-z^2) * erfc(-i*z),
 *   nameless function (7.1.3) of Abramowitz&Stegun (1964),
 *   also known as the plasma dispersion function.
 *
 *   This implementation uses a combination of different algorithms.
 *   See man 3 w_of_z for references.
 *
 * Copyright:
 *   (C) 2012 Massachusetts Institute of Technology
 *   (C) 2013 Forschungszentrum Jülich GmbH
 *
 * Licence:
 *   Permission is hereby granted, free of charge, to any person obtaining
 *   a copy of this software and associated documentation files (the
 *   "Software"), to deal in the Software without restriction, including
 *   without limitation the rights to use, copy, modify, merge, publish,
 *   distribute, sublicense, and/or sell copies of the Software, and to
 *   permit persons to whom the Software is furnished to do so, subject to
 *   the following conditions:
 *
 *   The above copyright notice and this permission notice shall be
 *   included in all copies or substantial portions of the Software.
 *
 *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 *   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 *   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 *   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 *   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *   Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
 *   Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
 *
 * Website:
 *   http://apps.jcns.fz-juelich.de/libcerf
 *
 * Revision history:
 *   ../CHANGELOG
 *
 * Man page:
 *   w_of_z(3)
 */

/*

    Todo: use local declarations (older compilers) (HH).  

*/

/*
   Computes various error functions (erf, erfc, erfi, erfcx),
   including the Dawson integral, in the complex plane, based
   on algorithms for the computation of the Faddeeva function
              w(z) = exp(-z^2) * erfc(-i*z).
   Given w(z), the error functions are mostly straightforward
   to compute, except for certain regions where we have to
   switch to Taylor expansions to avoid cancellation errors
   [e.g. near the origin for erf(z)].

*/

#include "cerf.h"
#include <float.h>
#include <math.h>
#include "defs.h" // defines _cerf_cmplx, NaN, C, cexp, ...

// for analysing the algorithm:
EXPORT int faddeeva_algorithm;
EXPORT int faddeeva_nofterms;

/******************************************************************************/
/*  auxiliary functions                                                       */
/******************************************************************************/

static inline double sinc(double x, double sinx)
{
    // return sinc(x) = sin(x)/x, given both x and sin(x)
    // [since we only use this in cases where sin(x) has already been computed]
    return fabs(x) < 1e-4 ? 1 - (0.1666666666666666666667)*x*x : sinx / x;
}

static inline double sinh_taylor(double x)
{
    // sinh(x) via Taylor series, accurate to machine precision for |x| < 1e-2
    return x * (1 + (x*x) * (0.1666666666666666666667 + 0.00833333333333333333333 * (x*x)));
}

static inline double sqr(double x) { return x*x; }

/******************************************************************************/
/* precomputed table of expa2n2[n-1] = exp(-a2*n*n)                           */
/* for double-precision a2 = 0.26865... in w_of_z, below.                     */
/******************************************************************************/

static const double expa2n2[] = {
    7.64405281671221563e-01,
    3.41424527166548425e-01,
    8.91072646929412548e-02,
    1.35887299055460086e-02,
    1.21085455253437481e-03,
    6.30452613933449404e-05,
    1.91805156577114683e-06,
    3.40969447714832381e-08,
    3.54175089099469393e-10,
    2.14965079583260682e-12,
    7.62368911833724354e-15,
    1.57982797110681093e-17,
    1.91294189103582677e-20,
    1.35344656764205340e-23,
    5.59535712428588720e-27,
    1.35164257972401769e-30,
    1.90784582843501167e-34,
    1.57351920291442930e-38,
    7.58312432328032845e-43,
    2.13536275438697082e-47,
    3.51352063787195769e-52,
    3.37800830266396920e-57,
    1.89769439468301000e-62,
    6.22929926072668851e-68,
    1.19481172006938722e-73,
    1.33908181133005953e-79,
    8.76924303483223939e-86,
    3.35555576166254986e-92,
    7.50264110688173024e-99,
    9.80192200745410268e-106,
    7.48265412822268959e-113,
    3.33770122566809425e-120,
    8.69934598159861140e-128,
    1.32486951484088852e-135,
    1.17898144201315253e-143,
    6.13039120236180012e-152,
    1.86258785950822098e-160,
    3.30668408201432783e-169,
    3.43017280887946235e-178,
    2.07915397775808219e-187,
    7.36384545323984966e-197,
    1.52394760394085741e-206,
    1.84281935046532100e-216,
    1.30209553802992923e-226,
    5.37588903521080531e-237,
    1.29689584599763145e-247,
    1.82813078022866562e-258,
    1.50576355348684241e-269,
    7.24692320799294194e-281,
    2.03797051314726829e-292,
    3.34880215927873807e-304,
    0.0 // underflow (also prevents reads past array end, below)
}; // expa2n2

/******************************************************************************/
/*  w_of_z, Faddeeva's scaled complex error function                          */
/******************************************************************************/

_cerf_cmplx w_of_z(_cerf_cmplx z)
{
    faddeeva_nofterms = 0;

    // Steven G. Johnson, October 2012.

    if (creal(z) == 0.0) {
        // Purely imaginary input, purely real output.
        // However, use creal(z) to give correct sign of 0 in cimag(w).
        return C(erfcx(cimag(z)), creal(z));
    }
    if (cimag(z) == 0) {
        // Purely real input, complex output.
        return C(exp(-sqr(creal(z))),  im_w_of_x(creal(z)));
    }

    const double relerr = DBL_EPSILON;
    const double a = 0.518321480430085929872; // pi / sqrt(-log(eps*0.5))
    const double c = 0.329973702884629072537; // (2/pi) * a;
    const double a2 = 0.268657157075235951582; // a^2

    const double x = fabs(creal(z));
    const double y = cimag(z);
    const double ya = fabs(y);

    _cerf_cmplx ret = C(0., 0.); // return value

    double sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum5 = 0;

    if (ya > 7 || (x > 6  // continued fraction is faster
                   /* As pointed out by M. Zaghloul, the continued
                      fraction seems to give a large relative error in
                      Re w(z) for |x| ~ 6 and small |y|, so use
                      algorithm 816 in this region: */
                   && (ya > 0.1 || (x > 8 && ya > 1e-10) || x > 28))) {

        faddeeva_algorithm = 100;

        /* Poppe & Wijers suggest using a number of terms
           nu = 3 + 1442 / (26*rho + 77)
           where rho = sqrt((x/x0)^2 + (y/y0)^2) where x0=6.3, y0=4.4.
           (They only use this expansion for rho >= 1, but rho a little less
           than 1 seems okay too.)
           Instead, I did my own fit to a slightly different function
           that avoids the hypotenuse calculation, using NLopt to minimize
           the sum of the squares of the errors in nu with the constraint
           that the estimated nu be >= minimum nu to attain machine precision.
           I also separate the regions where nu == 2 and nu == 1. */
        const double ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
        double xs = y < 0 ? -creal(z) : creal(z); // compute for -z if y < 0
        if (x + ya > 4000) { // nu <= 2
            if (x + ya > 1e7) { // nu == 1, w(z) = i/sqrt(pi) / z
                // scale to avoid overflow
                if (x > ya) {
                    faddeeva_algorithm += 1;
                    double yax = ya / xs;
                    faddeeva_algorithm = 100;
                    double denom = ispi / (xs + yax*ya);
                    ret = C(denom*yax, denom);
                }
                else if (isinf(ya)) {
                    faddeeva_algorithm += 2;
                    return ((isnan(x) || y < 0)
                            ? C(NaN,NaN) : C(0,0));
                }
                else {
                    faddeeva_algorithm += 3;
                    double xya = xs / ya;
                    double denom = ispi / (xya*xs + ya);
                    ret = C(denom, denom*xya);
                }
            }
            else { // nu == 2, w(z) = i/sqrt(pi) * z / (z*z - 0.5)
                faddeeva_algorithm += 4;
                double dr = xs*xs - ya*ya - 0.5, di = 2*xs*ya;
                double denom = ispi / (dr*dr + di*di);
                ret = C(denom * (xs*di-ya*dr), denom * (xs*dr+ya*di));
            }
        }
        else { // compute nu(z) estimate and do general continued fraction
            faddeeva_algorithm += 5;
            const double c0=3.9, c1=11.398, c2=0.08254, c3=0.1421, c4=0.2023; // fit
            double nu = floor(c0 + c1 / (c2*x + c3*ya + c4));
            double wr = xs, wi = ya;
            for (nu = 0.5 * (nu - 1); nu > 0.4; nu -= 0.5) {
                // w <- z - nu/w:
                double denom = nu / (wr*wr + wi*wi);
                wr = xs - wr * denom;
                wi = ya + wi * denom;
            }
            { // w(z) = i/sqrt(pi) / w:
                double denom = ispi / (wr*wr + wi*wi);
                ret = C(denom*wi, denom*wr);
            }
        }
        if (y < 0) {
            faddeeva_algorithm += 10;
            // use w(z) = 2.0*exp(-z*z) - w(-z),
            // but be careful of overflow in exp(-z*z)
            //                                = exp(-(xs*xs-ya*ya) -2*i*xs*ya)
            return complex_sub_cc(complex_mul_rc(2.0,cexp(C((ya-xs)*(xs+ya), 2*xs*y))), ret);
        }
        else
            return ret;
    }

    /* Note: The test that seems to be suggested in the paper is x <
       sqrt(-log(DBL_MIN)), about 26.6, since otherwise exp(-x^2)
       underflows to zero and sum1,sum2,sum4 are zero.  However, long
       before this occurs, the sum1,sum2,sum4 contributions are
       negligible in double precision; I find that this happens for x >
       about 6, for all y.  On the other hand, I find that the case
       where we compute all of the sums is faster (at least with the
       precomputed expa2n2 table) until about x=10.  Furthermore, if we
       try to compute all of the sums for x > 20, I find that we
       sometimes run into numerical problems because underflow/overflow
       problems start to appear in the various coefficients of the sums,
       below.  Therefore, we use x < 10 here. */
    else if (x < 10) {

        faddeeva_algorithm = 200;

        double prod2ax = 1, prodm2ax = 1;
        double expx2;

        if (isnan(y)) {
            faddeeva_algorithm += 99;
            return C(y,y);
        }

        if (x < 5e-4) { // compute sum4 and sum5 together as sum5-sum4
                        // This special case is needed for accuracy.
            faddeeva_algorithm += 1;
            const double x2 = x*x;
            expx2 = 1 - x2 * (1 - 0.5*x2); // exp(-x*x) via Taylor
            // compute exp(2*a*x) and exp(-2*a*x) via Taylor, to double precision
            const double ax2 = 1.036642960860171859744*x; // 2*a*x
            const double exp2ax =
                1 + ax2 * (1 + ax2 * (0.5 + 0.166666666666666666667*ax2));
            const double expm2ax =
                1 - ax2 * (1 - ax2 * (0.5 - 0.166666666666666666667*ax2));
            for (int n = 1; ; ++n) {
                ++faddeeva_nofterms;
                const double coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
                prod2ax *= exp2ax;
                prodm2ax *= expm2ax;
                sum1 += coef;
                sum2 += coef * prodm2ax;
                sum3 += coef * prod2ax;

                // really = sum5 - sum4
                sum5 += coef * (2*a) * n * sinh_taylor((2*a)*n*x);

                // test convergence via sum3
                if (coef * prod2ax < relerr * sum3) break;
            }
        }
        else { // x > 5e-4, compute sum4 and sum5 separately
            faddeeva_algorithm += 2;
            expx2 = exp(-x*x);
            const double exp2ax = exp((2*a)*x), expm2ax = 1 / exp2ax;
            for (int n = 1; ; ++n) {
                ++faddeeva_nofterms;
                const double coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
                prod2ax *= exp2ax;
                prodm2ax *= expm2ax;
                sum1 += coef;
                sum2 += coef * prodm2ax;
                sum4 += (coef * prodm2ax) * (a*n);
                sum3 += coef * prod2ax;
                sum5 += (coef * prod2ax) * (a*n);
                // test convergence via sum5, since this sum has the slowest decay
                if ((coef * prod2ax) * (a*n) < relerr * sum5) break;
            }
        }
        const double expx2erfcxy = // avoid spurious overflow for large negative y
            y > -6 // for y < -6, erfcx(y) = 2*exp(y*y) to double precision
            ? expx2*erfcx(y) : 2*exp(y*y-x*x);
        if (y > 5) { // imaginary terms cancel
            faddeeva_algorithm += 10;
            const double sinxy = sin(x*y);
            ret = C((expx2erfcxy - c*y*sum1) * cos(2*x*y) + (c*x*expx2) * sinxy * sinc(x*y, sinxy), 0.0);
        }
        else {
            faddeeva_algorithm += 20;
            double xs = creal(z);
            const double sinxy = sin(xs*y);
            const double sin2xy = sin(2*xs*y), cos2xy = cos(2*xs*y);
            const double coef1 = expx2erfcxy - c*y*sum1;
            const double coef2 = c*xs*expx2;
            ret = C(coef1 * cos2xy + coef2 * sinxy * sinc(xs*y, sinxy),
                    coef2 * sinc(2*xs*y, sin2xy) - coef1 * sin2xy);
        }
    }
    else { // x large: only sum3 & sum5 contribute (see above note)

        faddeeva_algorithm = 300;

        if (isnan(x))
            return C(x,x);
        if (isnan(y))
            return C(y,y);

        ret = C(exp(-x*x),0.0); // |y| < 1e-10, so we only need exp(-x*x) term
        // (round instead of ceil as in original paper; note that x/a > 1 here)
        double n0 = floor(x/a + 0.5); // sum in both directions, starting at n0
        double dx = a*n0 - x;
        sum3 = exp(-dx*dx) / (a2*(n0*n0) + y*y);
        sum5 = a*n0 * sum3;
        double exp1 = exp(4*a*dx), exp1dn = 1;
        int dn;
        for (dn = 1; n0 - dn > 0; ++dn) { // loop over n0-dn and n0+dn terms
            double np = n0 + dn, nm = n0 - dn;
            double tp = exp(-sqr(a*dn+dx));
            double tm = tp * (exp1dn *= exp1); // trick to get tm from tp
            tp /= (a2*(np*np) + y*y);
            tm /= (a2*(nm*nm) + y*y);
            sum3 += tp + tm;
            sum5 += a * (np * tp + nm * tm);
            if (a * (np * tp + nm * tm) < relerr * sum5) goto finish;
        }
        while (1) { // loop over n0+dn terms only (since n0-dn <= 0)
            double np = n0 + dn++;
            double tp = exp(-sqr(a*dn+dx)) / (a2*(np*np) + y*y);
            sum3 += tp;
            sum5 += a * np * tp;
            if (a * np * tp < relerr * sum5) goto finish;
        }
    }
finish:
    return complex_add_cc(ret, C((0.5*c)*y*(sum2+sum3),(0.5*c)*copysign(sum5-sum4, creal(z))));
} // w_of_z