/*************************************************************************\
* Copyright (c) 2002 The University of Chicago, as Operator of Argonne
* National Laboratory.
* Copyright (c) 2002 The Regents of the University of California, as
* Operator of Los Alamos National Laboratory.
* This file is distributed subject to a Software License Agreement found
* in the file LICENSE that is included with this distribution. 
\*************************************************************************/

/* file: csbend.c
 * contents:  track_through_canonical_sbend()
 *
 *
 * Michael Borland, 1991, 1992.
 */
#include "mdb.h"
#include "track.h"
#include "csbend.h"
#ifdef HAVE_GPU
#include "gpu_base.h"
#include "gpu_csbend.h"
#include "gpu_funcs.h"
#endif

/* global variables */
/* NOTE(review): the code that reads and updates these two warning counters is not
 * visible near these declarations -- confirm their roles before relying on them. */
long negativeWarningsLeft = 100;
long dipoleFringeWarning = 0;
/* Expansion order plus one.  computeCSBENDFieldCoefficients() sets this to
 * expansionOrder+1 and aborts if the result exceeds 11; computeCSBENDFields()
 * uses it as the bound on the powers of x and y that it sums over. */
long expansionOrder1 = 11;  /* order of expansion+1 */
/* Set by computeCSBENDFieldCoefficients(): hasNormal becomes 1 if any of b[0..8]
 * is nonzero, hasSkew becomes 1 if any of c[0..8] is nonzero.  computeCSBENDFields()
 * skips the polynomial evaluation entirely when both are zero. */
long hasSkew = 0, hasNormal = 0;
/* NOTE(review): bending-radius and radiation-related state; presumably set by the
 * tracking routines before integration -- confirm against the code that assigns them. */
double rho0, rho_actual, rad_coef=0, isrConstant=0;
double meanPhotonsPerRadian0, meanPhotonsPerMeter0, normalizedCriticalEnergy0;
long distributionBasedRadiation, includeOpeningAngle;
/* NOTE(review): presumably running totals for radiation bookkeeping -- confirm. */
long photonCount = 0;
double energyCount = 0, radiansTotal = 0;
/* Coefficient tables: entry [i][j] multiplies x^i*y^j in computeCSBENDFields().
 * Allocated as 11x11 arrays by computeCSBENDFieldCoefficients() the first time
 * it finds them NULL. */
double **Fx_xy = NULL, **Fy_xy = NULL;

/* Forward declarations.
 * NOTE(review): these routines are presumably defined later in this file -- confirm.
 */
void convolveArrays1(double *output, long n, double *a1, double *a2);
void dipoleFringeKHwang(double *Qf, double *Qi,
			double rho, double inFringe, long higherOrder, double K1, double edge, double gap,
			double fint, double Rhe);
void dipoleFringeKHwangRLindberg(double *Qf, double *Qi,
				 double rho, double inFringe, double K1, double edge,
				 double gap, double fint, double Rhe);


void addRadiationKick(double *Qx, double *Qy, double *dPoP, double *sigmaDelta2,
		      double x, double y, double theta, double thetaf, double h0, double Fx, double Fy,
		      double ds, double radCoef, double dsISR, double isrCoef,
                      long distributionBased, long includeOpeningAngle,
                      double meanPhotonsPerMeter,
                      double normalizedCriticalEnergy, double Po);
double pickNormalizedPhotonEnergy(double RN);

long integrate_csbend_ordn(double *Qf, double *Qi, double *sigmaDelta2, double s, long n, long i, double rho0, double p0,
                           double *dz_lost, MULT_APERTURE_DATA *apData, short integration_order);
long integrate_csbend_ordn_expanded(double *Qf, double *Qi, double *sigmaDelta2, double s, long n, long i, double rho0, double p0,
                                    double *dz_lost, MULT_APERTURE_DATA *apData, short integration_order);
void convertFromCSBendCoords(double **part, long np, double rho0, 
			     double cos_ttilt, double sin_ttilt, long ctMode);
void convertToCSBendCoords(double **part, long np, double rho0, 
			     double cos_ttilt, double sin_ttilt, long ctMode);
void applyFilterTable(double *function, long bins, double dt, long fValues,
                      double *fFreq, double *fReal, double *fImag);

long correctDistribution(double *array, long npoints, double desiredSum);

void convertToDipoleCanonicalCoordinates(double *Qi, long expanded);
void convertFromDipoleCanonicalCoordinates(double *Qi, long expanded);

long inversePoissonCDF(double mu, double C);

/* Photon logging.
 * NOTE(review): the three variables below are presumably the state shared by
 * setUpCsbendPhotonOutputFile() and logPhoton() -- confirm against their definitions.
 */
void setUpCsbendPhotonOutputFile(CSBEND *csbend, char *rootname, long np);
void logPhoton(double Ep, double x, double xp, double y, double yp, double theta, double thetaf, double rho);
SDDS_DATASET *SDDSphotons;
long photonRows;
double photonLowEnergyCutoff;

/* Reference-trajectory state.  refTrajectoryMode starts at 0, which matches neither
 * macro below.
 * NOTE(review): presumably the macros are the nonzero values refTrajectoryMode can
 * take and 0 means no reference-trajectory handling -- confirm.
 */
#define RECORD_TRAJECTORY 1
#define SUBTRACT_TRAJECTORY 2
short refTrajectoryMode = 0;
long refTrajectoryPoints = 0;
double **refTrajectoryData = NULL;

void applySimpleDipoleEdgeKick(double *xp, double *yp, double x, double y, double delta, double rho, double ea, 
                               double psi, double kickLimit, long expanded);

/* Evaluate the two-dimensional polynomial expansions stored in the global
 * coefficient tables Fx_xy and Fy_xy at the point (x, y):
 *
 *     *Fx = sum over i,j of Fx_xy[i][j] * x^i * y^j
 *     *Fy = sum over i,j of Fy_xy[i][j] * x^i * y^j
 *
 * Only terms with i+j <= expansionOrder1-1 are included.
 *
 * When hasSkew is zero, only every other power of y is visited: odd powers
 * for Fx (starting at y^1) and even powers for Fy (starting at y^0).  When
 * hasSkew is nonzero, every power of y is visited for both sums.
 *
 * If neither hasSkew nor hasNormal is set, the tables are not consulted and
 * the result is *Fx = 0, *Fy = 1.
 *
 * The local power tables hold 11 entries, matching the upper limit of 11 that
 * computeCSBENDFieldCoefficients() enforces on expansionOrder1.
 */
void computeCSBENDFields(double *Fx, double *Fy, double x, double y)
{
  double xPower[11], yPower[11];
  double totalFx, totalFy;
  long ix, iy, iyEnd, iyFirstFx, iyStride;

  if (!(hasSkew || hasNormal)) {
    /* no multipole content flagged: return the fixed values */
    *Fx = 0;
    *Fy = 1;
    return;
  }

  /* tabulate x^k and y^k for k = 0 ... expansionOrder1-1 */
  xPower[0] = 1;
  yPower[0] = 1;
  for (ix=1; ix<expansionOrder1; ix++) {
    xPower[ix] = xPower[ix-1]*x;
    yPower[ix] = yPower[ix-1]*y;
  }

  /* choose which powers of y are visited (see header comment) */
  if (hasSkew) {
    iyFirstFx = 0;
    iyStride = 1;
  } else {
    iyFirstFx = 1;
    iyStride = 2;
  }

  totalFx = totalFy = 0;
  for (ix=0; ix<expansionOrder1; ix++) {
    /* Note: stopping the y power below expansionOrder1-ix ensures that
     * for x^ix*y^iy, ix+iy<=(expansionOrder1-1) */
    iyEnd = expansionOrder1 - ix;
    for (iy=iyFirstFx; iy<iyEnd; iy+=iyStride)
      totalFx += Fx_xy[ix][iy]*xPower[ix]*yPower[iy];
    for (iy=0; iy<iyEnd; iy+=iyStride)
      totalFy += Fy_xy[ix][iy]*xPower[ix]*yPower[iy];
  }

  *Fx = totalFx;
  *Fy = totalFy;
}

void computeCSBENDFieldCoefficients(double *b, double *c, double h1, long nonlinear, long expansionOrder)
{
  long i;
  double h[20];

  if (expansionOrder==0) {
    /* set the order to be <highestMultipole>+2 */
    for (i=8; i>=0; i--)
      if (b[i] || c[i])
        break;
    if ((expansionOrder = i+2)<4)
      /* make minimum value 4 for backward compatibility */
      expansionOrder = 4;  
  }

  expansionOrder1 = expansionOrder + 1;
  if (expansionOrder1>11)
    bombElegant("expansion order >10 for CSBEND or CSRCSBEND", NULL);

  hasSkew = hasNormal = 0;
  for (i=0; i<9; i++) {
    if (b[i])
      hasNormal = 1;
    if (c[i]) 
      hasSkew = 1;
  }

  if (!Fx_xy)
    Fx_xy = (double**)czarray_2d(sizeof(double), 11, 11);
  if (!Fy_xy)
    Fy_xy = (double**)czarray_2d(sizeof(double), 11, 11);
    
  for (i=0; i<expansionOrder1; i++) {
    memset(Fx_xy[i], 0, expansionOrder1*sizeof(double));
    memset(Fy_xy[i], 0, expansionOrder1*sizeof(double));
  }
  
  h[0] = 1;
  for (i=1; i<20; i++)
    h[i] = h[i-1]*h1;

  Fx_xy[0][0] = c[0];
  Fy_xy[0][0] = 1 - b[0];

  /* these increments allow using the previous indexing from when c[0] and b[0] were the quadrupole etc. */
  b += 1;
  c += 1;
  
  Fx_xy[0][1] = b[0];
  Fy_xy[1][0] = b[0];
  Fy_xy[0][1] = c[0];
  Fx_xy[1][0] = -c[0];

  if (nonlinear) {
    Fy_xy[0][2] = -(h[1]*b[0])/2 - b[1]/2;
    Fy_xy[0][3] = (h[2]*c[0])/6 - (h[1]*c[1])/3 - c[2]/6;
    Fy_xy[0][4] = (h[3]*b[0])/24 - (h[2]*b[1])/24 + (h[1]*b[2])/12 + b[3]/24;
    Fy_xy[0][5] = (-3*h[4]*c[0])/40 + (h[3]*c[1])/20 - (h[2]*c[2])/40 + (h[1]*c[3])/40 + c[4]/120;
    Fy_xy[0][6] = -(h[5]*b[0])/80 + (h[4]*b[1])/80 - (h[3]*b[2])/120 + (h[2]*b[3])/240 - (h[1]*b[4])/240 - b[5]/720;
    Fy_xy[0][7] = (5*h[6]*c[0])/112 - (h[5]*c[1])/40 + (17*h[4]*c[2])/1680 - (h[3]*c[3])/280 + (h[2]*c[4])/840 - 
      (h[1]*c[5])/1260 - c[6]/5040;
    Fy_xy[0][8] = (5*h[7]*b[0])/896 - (5*h[6]*b[1])/896 + (h[5]*b[2])/320 - (17*h[4]*b[3])/13440 + (h[3]*b[4])/2240 - 
      (h[2]*b[5])/6720 + (h[1]*b[6])/10080 + b[7]/40320;
    Fy_xy[0][9] = (-35*h[8]*c[0])/1152 + (65*h[7]*c[1])/4032 - (145*h[6]*c[2])/24192 + (43*h[5]*c[3])/24192 - 
      (11*h[4]*c[4])/24192 + (h[3]*c[5])/9072 - (h[2]*c[6])/36288 + (h[1]*c[7])/72576;
    Fy_xy[0][10] = (-7*h[9]*b[0])/2304 + (7*h[8]*b[1])/2304 - (13*h[7]*b[2])/8064 + (29*h[6]*b[3])/48384 - 
      (43*h[5]*b[4])/241920 + (11*h[4]*b[5])/241920 - (h[3]*b[6])/90720 + (h[2]*b[7])/362880;
    Fy_xy[1][1] = h[1]*c[0] + c[1];
    Fy_xy[1][2] = (h[2]*b[0])/2 - (h[1]*b[1])/2 - b[2]/2;
    Fy_xy[1][3] = -(h[3]*c[0])/2 + (h[2]*c[1])/2 - (h[1]*c[2])/3 - c[3]/6;
    Fy_xy[1][4] = -(h[4]*b[0])/8 + (h[3]*b[1])/8 - (h[2]*b[2])/8 + (h[1]*b[3])/12 + b[4]/24;
    Fy_xy[1][5] = (3*h[5]*c[0])/8 - (9*h[4]*c[1])/40 + (h[3]*c[2])/10 - (h[2]*c[3])/20 + (h[1]*c[4])/40 + c[5]/120;
    Fy_xy[1][6] = (h[6]*b[0])/16 - (h[5]*b[1])/16 + (3*h[4]*b[2])/80 - (h[3]*b[3])/60 + (h[2]*b[4])/120 - (h[1]*b[5])/240 - 
      b[6]/720;
    Fy_xy[1][7] = (-5*h[7]*c[0])/16 + (19*h[6]*c[1])/112 - (11*h[5]*c[2])/168 + (h[4]*c[3])/48 - (h[3]*c[4])/168 + 
      (h[2]*c[5])/504 - (h[1]*c[6])/1260 - c[7]/5040;
    Fy_xy[1][8] = (-5*h[8]*b[0])/128 + (5*h[7]*b[1])/128 - (19*h[6]*b[2])/896 + (11*h[5]*b[3])/1344 - (h[4]*b[4])/384 + 
      (h[3]*b[5])/1344 - (h[2]*b[6])/4032 + (h[1]*b[7])/10080;
    Fy_xy[1][9] = (35*h[9]*c[0])/128 - (55*h[8]*c[1])/384 + (5*h[7]*c[2])/96 - (5*h[6]*c[3])/336 + (29*h[5]*c[4])/8064 - 
      (19*h[4]*c[5])/24192 + (h[3]*c[6])/6048 - (h[2]*c[7])/24192;
    Fy_xy[1][10] = (7*h[10]*b[0])/256 - (7*h[9]*b[1])/256 + (11*h[8]*b[2])/768 - (h[7]*b[3])/192 + (h[6]*b[4])/672 - 
      (29*h[5]*b[5])/80640 + (19*h[4]*b[6])/241920 - (h[3]*b[7])/60480;
    Fy_xy[2][0] = b[1]/2;
    Fy_xy[2][1] = -(h[2]*c[0]) + (h[1]*c[1])/2 + c[2]/2;
    Fy_xy[2][2] = -(h[3]*b[0])/2 + (h[2]*b[1])/2 - (h[1]*b[2])/4 - b[3]/4;
    Fy_xy[2][3] = h[4]*c[0] - (3*h[3]*c[1])/4 + (5*h[2]*c[2])/12 - (h[1]*c[3])/6 - c[4]/12;
    Fy_xy[2][4] = (h[5]*b[0])/4 - (h[4]*b[1])/4 + (3*h[3]*b[2])/16 - (5*h[2]*b[3])/48 + (h[1]*b[4])/24 + b[5]/48;
    Fy_xy[2][5] = (-9*h[6]*c[0])/8 + (51*h[5]*c[1])/80 - (21*h[4]*c[2])/80 + (h[3]*c[3])/10 - (3*h[2]*c[4])/80 + 
      (h[1]*c[5])/80 + c[6]/240;
    Fy_xy[2][6] = (-3*h[7]*b[0])/16 + (3*h[6]*b[1])/16 - (17*h[5]*b[2])/160 + (7*h[4]*b[3])/160 - (h[3]*b[4])/60 + 
      (h[2]*b[5])/160 - (h[1]*b[6])/480 - b[7]/1440;
    Fy_xy[2][7] = (5*h[8]*c[0])/4 - (149*h[7]*c[1])/224 + (167*h[6]*c[2])/672 - (25*h[5]*c[3])/336 + (13*h[4]*c[4])/672 - 
      (5*h[3]*c[5])/1008 + (h[2]*c[6])/720 - (h[1]*c[7])/2520;
    Fy_xy[2][8] = (5*h[9]*b[0])/32 - (5*h[8]*b[1])/32 + (149*h[7]*b[2])/1792 - (167*h[6]*b[3])/5376 + 
      (25*h[5]*b[4])/2688 - (13*h[4]*b[5])/5376 + (5*h[3]*b[6])/8064 - (h[2]*b[7])/5760;
    Fy_xy[2][9] = (-175*h[10]*c[0])/128 + (545*h[9]*c[1])/768 - (65*h[8]*c[2])/256 + (95*h[7]*c[3])/1344 - 
      (265*h[6]*c[4])/16128 + (163*h[5]*c[5])/48384 - (31*h[4]*c[6])/48384 + (h[3]*c[7])/8064;
    Fy_xy[2][10] = (-35*h[11]*b[0])/256 + (35*h[10]*b[1])/256 - (109*h[9]*b[2])/1536 + (13*h[8]*b[3])/512 - 
      (19*h[7]*b[4])/2688 + (53*h[6]*b[5])/32256 - (163*h[5]*b[6])/483840 + (31*h[4]*b[7])/483840;
    Fy_xy[3][0] = b[2]/6;
    Fy_xy[3][1] = h[3]*c[0] - (h[2]*c[1])/2 + (h[1]*c[2])/6 + c[3]/6;
    Fy_xy[3][2] = (h[4]*b[0])/2 - (h[3]*b[1])/2 + (h[2]*b[2])/4 - (h[1]*b[3])/12 - b[4]/12;
    Fy_xy[3][3] = (-5*h[5]*c[0])/3 + (13*h[4]*c[1])/12 - (19*h[3]*c[2])/36 + (7*h[2]*c[3])/36 - (h[1]*c[4])/18 - c[5]/36;
    Fy_xy[3][4] = (-5*h[6]*b[0])/12 + (5*h[5]*b[1])/12 - (13*h[4]*b[2])/48 + (19*h[3]*b[3])/144 - (7*h[2]*b[4])/144 + 
      (h[1]*b[5])/72 + b[6]/144;
    Fy_xy[3][5] = (21*h[7]*c[0])/8 - (23*h[6]*c[1])/16 + (9*h[5]*c[2])/16 - (3*h[4]*c[3])/16 + (7*h[3]*c[4])/120 - 
      (h[2]*c[5])/60 + (h[1]*c[6])/240 + c[7]/720;
    Fy_xy[3][6] = (7*h[8]*b[0])/16 - (7*h[7]*b[1])/16 + (23*h[6]*b[2])/96 - (3*h[5]*b[3])/32 + (h[4]*b[4])/32 - 
      (7*h[3]*b[5])/720 + (h[2]*b[6])/360 - (h[1]*b[7])/1440;
    Fy_xy[3][7] = (-15*h[9]*c[0])/4 + (63*h[8]*c[1])/32 - (23*h[7]*c[2])/32 + (139*h[6]*c[3])/672 - (17*h[5]*c[4])/336 + 
      (23*h[4]*c[5])/2016 - (13*h[3]*c[6])/5040 + (h[2]*c[7])/1680;
    Fy_xy[3][8] = (-15*h[10]*b[0])/32 + (15*h[9]*b[1])/32 - (63*h[8]*b[2])/256 + (23*h[7]*b[3])/256 - 
      (139*h[6]*b[4])/5376 + (17*h[5]*b[5])/2688 - (23*h[4]*b[6])/16128 + (13*h[3]*b[7])/40320;
    Fy_xy[3][9] = (1925*h[11]*c[0])/384 - (1985*h[10]*c[1])/768 + (2105*h[9]*c[2])/2304 - (575*h[8]*c[3])/2304 + 
      (65*h[7]*c[4])/1152 - (115*h[6]*c[5])/10368 + (41*h[5]*c[6])/20736 - (7*h[4]*c[7])/20736;
    Fy_xy[3][10] = (385*h[12]*b[0])/768 - (385*h[11]*b[1])/768 + (397*h[10]*b[2])/1536 - (421*h[9]*b[3])/4608 + 
      (115*h[8]*b[4])/4608 - (13*h[7]*b[5])/2304 + (23*h[6]*b[6])/20736 - (41*h[5]*b[7])/207360;
    Fy_xy[4][0] = b[3]/24;
    Fy_xy[4][1] = -(h[4]*c[0]) + (h[3]*c[1])/2 - (h[2]*c[2])/6 + (h[1]*c[3])/24 + c[4]/24;
    Fy_xy[4][2] = -(h[5]*b[0])/2 + (h[4]*b[1])/2 - (h[3]*b[2])/4 + (h[2]*b[3])/12 - (h[1]*b[4])/48 - b[5]/48;
    Fy_xy[4][3] = (5*h[6]*c[0])/2 - (3*h[5]*c[1])/2 + (2*h[4]*c[2])/3 - (11*h[3]*c[3])/48 + (h[2]*c[4])/16 - (h[1]*c[5])/72 - 
      c[6]/144;
    Fy_xy[4][4] = (5*h[7]*b[0])/8 - (5*h[6]*b[1])/8 + (3*h[5]*b[2])/8 - (h[4]*b[3])/6 + (11*h[3]*b[4])/192 - 
      (h[2]*b[5])/64 + (h[1]*b[6])/288 + b[7]/576;
    Fy_xy[4][5] = (-21*h[8]*c[0])/4 + (45*h[7]*c[1])/16 - (17*h[6]*c[2])/16 + (21*h[5]*c[3])/64 - (29*h[4]*c[4])/320 + 
      (11*h[3]*c[5])/480 - (h[2]*c[6])/192 + (h[1]*c[7])/960;
    Fy_xy[4][6] = (-7*h[9]*b[0])/8 + (7*h[8]*b[1])/8 - (15*h[7]*b[2])/32 + (17*h[6]*b[3])/96 - (7*h[5]*b[4])/128 + 
      (29*h[4]*b[5])/1920 - (11*h[3]*b[6])/2880 + (h[2]*b[7])/1152;
    Fy_xy[4][7] = (75*h[10]*c[0])/8 - (39*h[9]*c[1])/8 + (7*h[8]*c[2])/4 - (439*h[7]*c[3])/896 + (103*h[6]*c[4])/896 - 
      (97*h[5]*c[5])/4032 + (193*h[4]*c[6])/40320 - (19*h[3]*c[7])/20160;
    Fy_xy[4][8] = (75*h[11]*b[0])/64 - (75*h[10]*b[1])/64 + (39*h[9]*b[2])/64 - (7*h[8]*b[3])/32 + (439*h[7]*b[4])/7168 - 
      (103*h[6]*b[5])/7168 + (97*h[5]*b[6])/32256 - (193*h[4]*b[7])/322560;
    Fy_xy[4][9] = (-1925*h[12]*c[0])/128 + (1975*h[11]*c[1])/256 - (2075*h[10]*c[2])/768 + (745*h[9]*c[3])/1024 - 
      (165*h[8]*c[4])/1024 + (425*h[7]*c[5])/13824 - (145*h[6]*c[6])/27648 + (23*h[5]*c[7])/27648;
    Fy_xy[4][10] = (-385*h[13]*b[0])/256 + (385*h[12]*b[1])/256 - (395*h[11]*b[2])/512 + (415*h[10]*b[3])/1536 - 
      (149*h[9]*b[4])/2048 + (33*h[8]*b[5])/2048 - (85*h[7]*b[6])/27648 + (29*h[6]*b[7])/55296;
    Fy_xy[5][0] = b[4]/120;
    Fy_xy[5][1] = h[5]*c[0] - (h[4]*c[1])/2 + (h[3]*c[2])/6 - (h[2]*c[3])/24 + (h[1]*c[4])/120 + c[5]/120;
    Fy_xy[5][2] = (h[6]*b[0])/2 - (h[5]*b[1])/2 + (h[4]*b[2])/4 - (h[3]*b[3])/12 + (h[2]*b[4])/48 - (h[1]*b[5])/240 - 
      b[6]/240;
    Fy_xy[5][3] = (-7*h[7]*c[0])/2 + 2*h[6]*c[1] - (5*h[5]*c[2])/6 + (13*h[4]*c[3])/48 - (17*h[3]*c[4])/240 + 
      (11*h[2]*c[5])/720 - (h[1]*c[6])/360 - c[7]/720;
    Fy_xy[5][4] = (-7*h[8]*b[0])/8 + (7*h[7]*b[1])/8 - (h[6]*b[2])/2 + (5*h[5]*b[3])/24 - (13*h[4]*b[4])/192 + 
      (17*h[3]*b[5])/960 - (11*h[2]*b[6])/2880 + (h[1]*b[7])/1440;
    Fy_xy[5][5] = (189*h[9]*c[0])/20 - (399*h[8]*c[1])/80 + (147*h[7]*c[2])/80 - (173*h[6]*c[3])/320 + 
      (221*h[5]*c[4])/1600 - (51*h[4]*c[5])/1600 + (h[3]*c[6])/150 - (h[2]*c[7])/800;
    Fy_xy[5][6] = (63*h[10]*b[0])/40 - (63*h[9]*b[1])/40 + (133*h[8]*b[2])/160 - (49*h[7]*b[3])/160 + 
      (173*h[6]*b[4])/1920 - (221*h[5]*b[5])/9600 + (17*h[4]*b[6])/3200 - (h[3]*b[7])/900;
    Fy_xy[5][7] = (-165*h[11]*c[0])/8 + (213*h[10]*c[1])/20 - (151*h[9]*c[2])/40 + (663*h[8]*c[3])/640 - 
      (151*h[7]*c[4])/640 + (271*h[6]*c[5])/5760 - (871*h[5]*c[6])/100800 + (307*h[4]*c[7])/201600;
    Fy_xy[5][8] = (-165*h[12]*b[0])/64 + (165*h[11]*b[1])/64 - (213*h[10]*b[2])/160 + (151*h[9]*b[3])/320 - 
      (663*h[8]*b[4])/5120 + (151*h[7]*b[5])/5120 - (271*h[6]*b[6])/46080 + (871*h[5]*b[7])/806400;
    Fy_xy[5][9] = (-13299*h[13]*c[0])/128 + (13189*h[12]*c[1])/256 - (4323*h[11]*c[2])/256 + (4207*h[10]*c[3])/1024 - 
      (12109*h[9]*c[4])/15360 + (17051*h[8]*c[5])/138240 - (1927*h[7]*c[6])/120960 + (403*h[6]*c[7])/241920;
    Fy_xy[5][10] = (-13299*h[14]*b[0])/1280 + (13299*h[13]*b[1])/1280 - (13189*h[12]*b[2])/2560 + 
      (4323*h[11]*b[3])/2560 - (4207*h[10]*b[4])/10240 + (12109*h[9]*b[5])/153600 - (17051*h[8]*b[6])/1382400 + 
      (1927*h[7]*b[7])/1209600;
    Fy_xy[6][0] = b[5]/720;
    Fy_xy[6][1] = -(h[6]*c[0]) + (h[5]*c[1])/2 - (h[4]*c[2])/6 + (h[3]*c[3])/24 - (h[2]*c[4])/120 + (h[1]*c[5])/720 + 
      c[6]/720;
    Fy_xy[6][2] = -(h[7]*b[0])/2 + (h[6]*b[1])/2 - (h[5]*b[2])/4 + (h[4]*b[3])/12 - (h[3]*b[4])/48 + (h[2]*b[5])/240 - 
      (h[1]*b[6])/1440 - b[7]/1440;
    Fy_xy[6][3] = (14*h[8]*c[0])/3 - (31*h[7]*c[1])/12 + (37*h[6]*c[2])/36 - (23*h[5]*c[3])/72 + (29*h[4]*c[4])/360 - 
      (73*h[3]*c[5])/4320 + (13*h[2]*c[6])/4320 - (h[1]*c[7])/2160;
    Fy_xy[6][4] = (7*h[9]*b[0])/6 - (7*h[8]*b[1])/6 + (31*h[7]*b[2])/48 - (37*h[6]*b[3])/144 + (23*h[5]*b[4])/288 - 
      (29*h[4]*b[5])/1440 + (73*h[3]*b[6])/17280 - (13*h[2]*b[7])/17280;
    Fy_xy[6][5] = (-63*h[10]*c[0])/4 + (329*h[9]*c[1])/40 - (119*h[8]*c[2])/40 + (271*h[7]*c[3])/320 - 
      (197*h[6]*c[4])/960 + (17*h[5]*c[5])/384 - (83*h[4]*c[6])/9600 + (11*h[3]*c[7])/7200;
    Fy_xy[6][6] = (-21*h[11]*b[0])/8 + (21*h[10]*b[1])/8 - (329*h[9]*b[2])/240 + (119*h[8]*b[3])/240 - 
      (271*h[7]*b[4])/1920 + (197*h[6]*b[5])/5760 - (17*h[5]*b[6])/2304 + (83*h[4]*b[7])/57600;
    Fy_xy[6][7] = (165*h[12]*c[0])/4 - (339*h[11]*c[1])/16 + (119*h[10]*c[2])/16 - (193*h[9]*c[3])/96 + 
      (43*h[8]*c[4])/96 - (199*h[7]*c[5])/2304 + (1213*h[6]*c[6])/80640 - (11*h[5]*c[7])/4480;
    Fy_xy[6][8] = (165*h[13]*b[0])/32 - (165*h[12]*b[1])/32 + (339*h[11]*b[2])/128 - (119*h[10]*b[3])/128 + 
      (193*h[9]*b[4])/768 - (43*h[8]*b[5])/768 + (199*h[7]*b[6])/18432 - (1213*h[6]*b[7])/645120;
    Fy_xy[6][9] = (56485*h[14]*c[0])/384 - (55825*h[13]*c[1])/768 + (54505*h[12]*c[2])/2304 - (52435*h[11]*c[3])/9216 + 
      (9887*h[10]*c[4])/9216 - (27113*h[9]*c[5])/165888 + (23489*h[8]*c[6])/1161216 - (71*h[7]*c[7])/36288;
    Fy_xy[6][10] = (11297*h[15]*b[0])/768 - (11297*h[14]*b[1])/768 + (11165*h[13]*b[2])/1536 - (10901*h[12]*b[3])/4608 + 
      (10487*h[11]*b[4])/18432 - (9887*h[10]*b[5])/92160 + (27113*h[9]*b[6])/1658880 - (23489*h[8]*b[7])/11612160;
    Fy_xy[7][0] = b[6]/5040;
    Fy_xy[7][1] = h[7]*c[0] - (h[6]*c[1])/2 + (h[5]*c[2])/6 - (h[4]*c[3])/24 + (h[3]*c[4])/120 - (h[2]*c[5])/720 + 
      (h[1]*c[6])/5040 + c[7]/5040;
    Fy_xy[7][2] = (h[8]*b[0])/2 - (h[7]*b[1])/2 + (h[6]*b[2])/4 - (h[5]*b[3])/12 + (h[4]*b[4])/48 - (h[3]*b[5])/240 + 
      (h[2]*b[6])/1440 - (h[1]*b[7])/10080;
    Fy_xy[7][3] = -6*h[9]*c[0] + (13*h[8]*c[1])/4 - (5*h[7]*c[2])/4 + (3*h[6]*c[3])/8 - (11*h[5]*c[4])/120 + 
      (3*h[4]*c[5])/160 - (11*h[3]*c[6])/3360 + (h[2]*c[7])/2016;
    Fy_xy[7][4] = (-3*h[10]*b[0])/2 + (3*h[9]*b[1])/2 - (13*h[8]*b[2])/16 + (5*h[7]*b[3])/16 - (3*h[6]*b[4])/32 + 
      (11*h[5]*b[5])/480 - (3*h[4]*b[6])/640 + (11*h[3]*b[7])/13440;
    Fy_xy[7][5] = (99*h[11]*c[0])/4 - (513*h[10]*c[1])/40 + (183*h[9]*c[2])/40 - (407*h[8]*c[3])/320 + (19*h[7]*c[4])/64 - 
      (39*h[6]*c[5])/640 + (757*h[5]*c[6])/67200 - (127*h[4]*c[7])/67200;
    Fy_xy[7][6] = (33*h[12]*b[0])/8 - (33*h[11]*b[1])/8 + (171*h[10]*b[2])/80 - (61*h[9]*b[3])/80 + (407*h[8]*b[4])/1920 - 
      (19*h[7]*b[5])/384 + (13*h[6]*b[6])/1280 - (757*h[5]*b[7])/403200;
    Fy_xy[7][7] = (4719*h[13]*c[0])/28 - (9339*h[12]*c[1])/112 + (3047*h[11]*c[2])/112 - (1471*h[10]*c[3])/224 + 
      (199*h[9]*c[4])/160 - (15331*h[8]*c[5])/80640 + (13213*h[7]*c[6])/564480 - (1229*h[6]*c[7])/564480;
    Fy_xy[7][8] = (4719*h[14]*b[0])/224 - (4719*h[13]*b[1])/224 + (9339*h[12]*b[2])/896 - (3047*h[11]*b[3])/896 + 
      (1471*h[10]*b[4])/1792 - (199*h[9]*b[5])/1280 + (15331*h[8]*b[6])/645120 - (13213*h[7]*b[7])/4515840;
    Fy_xy[7][9] = (6721*h[15]*c[0])/128 - (40755*h[14]*c[1])/1792 + (28171*h[13]*c[2])/5376 - (1375*h[12]*c[3])/3072 - 
      (4741*h[11]*c[4])/35840 + (14081*h[10]*c[5])/215040 - (691*h[9]*c[6])/43008 + (1819*h[8]*c[7])/645120;
    Fy_xy[7][10] = (6721*h[16]*b[0])/1280 - (6721*h[15]*b[1])/1280 + (8151*h[14]*b[2])/3584 - (28171*h[13]*b[3])/53760 + 
      (275*h[12]*b[4])/6144 + (4741*h[11]*b[5])/358400 - (14081*h[10]*b[6])/2150400 + (691*h[9]*b[7])/430080;
    Fy_xy[8][0] = b[7]/40320;
    Fy_xy[8][1] = -(h[8]*c[0]) + (h[7]*c[1])/2 - (h[6]*c[2])/6 + (h[5]*c[3])/24 - (h[4]*c[4])/120 + (h[3]*c[5])/720 - 
      (h[2]*c[6])/5040 + (h[1]*c[7])/40320;
    Fy_xy[8][2] = -(h[9]*b[0])/2 + (h[8]*b[1])/2 - (h[7]*b[2])/4 + (h[6]*b[3])/12 - (h[5]*b[4])/48 + (h[4]*b[5])/240 - 
      (h[3]*b[6])/1440 + (h[2]*b[7])/10080;
    Fy_xy[8][3] = (15*h[10]*c[0])/2 - 4*h[9]*c[1] + (3*h[8]*c[2])/2 - (7*h[7]*c[3])/16 + (5*h[6]*c[4])/48 - 
      (h[5]*c[5])/48 + (h[4]*c[6])/280 - (43*h[3]*c[7])/80640;
    Fy_xy[8][4] = (15*h[11]*b[0])/8 - (15*h[10]*b[1])/8 + h[9]*b[2] - (3*h[8]*b[3])/8 + (7*h[7]*b[4])/64 - 
      (5*h[6]*b[5])/192 + (h[5]*b[6])/192 - (h[4]*b[7])/1120;
    Fy_xy[8][5] = (-297*h[12]*c[0])/8 + (153*h[11]*c[1])/8 - (27*h[10]*c[2])/4 + (59*h[9]*c[3])/32 - (67*h[8]*c[4])/160 + 
      (53*h[7]*c[5])/640 - (197*h[6]*c[6])/13440 + (253*h[5]*c[7])/107520;
    Fy_xy[8][6] = (-99*h[13]*b[0])/16 + (99*h[12]*b[1])/16 - (51*h[11]*b[2])/16 + (9*h[10]*b[3])/8 - (59*h[9]*b[4])/192 + 
      (67*h[8]*b[5])/960 - (53*h[7]*b[6])/3840 + (197*h[6]*b[7])/80640;
    Fy_xy[8][7] = (-22737*h[14]*c[0])/112 + (2805*h[13]*c[1])/28 - (3641*h[12]*c[2])/112 + (3485*h[11]*c[3])/448 - 
      (3257*h[10]*c[4])/2240 + (4393*h[9]*c[5])/20160 - (367*h[8]*c[6])/14112 + (10291*h[7]*c[7])/4515840;
    Fy_xy[8][8] = (-22737*h[15]*b[0])/896 + (22737*h[14]*b[1])/896 - (2805*h[13]*b[2])/224 + (3641*h[12]*b[3])/896 - 
      (3485*h[11]*b[4])/3584 + (3257*h[10]*b[5])/17920 - (4393*h[9]*b[6])/161280 + (367*h[8]*b[7])/112896;
    Fy_xy[8][9] = (-24167*h[16]*c[0])/448 + (40755*h[15]*c[1])/1792 - (25597*h[14]*c[2])/5376 + (3355*h[13]*c[3])/21504 + 
      (8327*h[12]*c[4])/35840 - (57751*h[11]*c[5])/645120 + (18455*h[10]*c[6])/903168 - (13787*h[9]*c[7])/4014080;
    Fy_xy[8][10] = (-24167*h[17]*b[0])/4480 + (24167*h[16]*b[1])/4480 - (8151*h[15]*b[2])/3584 + 
      (25597*h[14]*b[3])/53760 - (671*h[13]*b[4])/43008 - (8327*h[12]*b[5])/358400 + (57751*h[11]*b[6])/6451200 - 
      (3691*h[10]*b[7])/1806336;
    Fy_xy[9][0] = 0;
    Fy_xy[9][1] = h[9]*c[0] - (h[8]*c[1])/2 + (h[7]*c[2])/6 - (h[6]*c[3])/24 + (h[5]*c[4])/120 - (h[4]*c[5])/720 + 
      (h[3]*c[6])/5040 - (h[2]*c[7])/40320;
    Fy_xy[9][2] = (h[10]*b[0])/2 - (h[9]*b[1])/2 + (h[8]*b[2])/4 - (h[7]*b[3])/12 + (h[6]*b[4])/48 - (h[5]*b[5])/240 + 
      (h[4]*b[6])/1440 - (h[3]*b[7])/10080;
    Fy_xy[9][3] = (-55*h[11]*c[0])/6 + (29*h[10]*c[1])/6 - (16*h[9]*c[2])/9 + (73*h[8]*c[3])/144 - (17*h[7]*c[4])/144 + 
      (5*h[6]*c[5])/216 - (59*h[5]*c[6])/15120 + (139*h[4]*c[7])/241920;
    Fy_xy[9][4] = (-55*h[12]*b[0])/24 + (55*h[11]*b[1])/24 - (29*h[10]*b[2])/24 + (4*h[9]*b[3])/9 - (73*h[8]*b[4])/576 + 
      (17*h[7]*b[5])/576 - (5*h[6]*b[6])/864 + (59*h[5]*b[7])/60480;
    Fy_xy[9][5] = (-715*h[13]*c[0])/8 + 44*h[12]*c[1] - (341*h[11]*c[2])/24 + (323*h[10]*c[3])/96 - (59*h[9]*c[4])/96 + 
      (101*h[8]*c[5])/1152 - (379*h[7]*c[6])/40320 + (197*h[6]*c[7])/322560;
    Fy_xy[9][6] = (-715*h[14]*b[0])/48 + (715*h[13]*b[1])/48 - (22*h[12]*b[2])/3 + (341*h[11]*b[3])/144 - 
      (323*h[10]*b[4])/576 + (59*h[9]*b[5])/576 - (101*h[8]*b[6])/6912 + (379*h[7]*b[7])/241920;
    Fy_xy[9][7] = (-2145*h[15]*c[0])/112 + (715*h[14]*c[1])/112 - (1045*h[12]*c[3])/1344 + (473*h[11]*c[4])/1344 - 
      (193*h[10]*c[5])/2016 + (535*h[9]*c[6])/28224 - (2609*h[8]*c[7])/903168;
    Fy_xy[9][8] = (-2145*h[16]*b[0])/896 + (2145*h[15]*b[1])/896 - (715*h[14]*b[2])/896 + (1045*h[12]*b[4])/10752 - 
      (473*h[11]*b[5])/10752 + (193*h[10]*b[6])/16128 - (535*h[9]*b[7])/225792;
    Fy_xy[9][9] = (32175*h[17]*c[0])/448 - (9295*h[16]*c[1])/256 + (22165*h[15]*c[2])/1792 - (202345*h[14]*c[3])/64512 + 
      (39325*h[13]*c[4])/64512 - (103345*h[12]*c[5])/1161216 + (70477*h[11]*c[6])/8128512 - (12937*h[10]*c[7])/65028096;
    Fy_xy[9][10] = (6435*h[18]*b[0])/896 - (6435*h[17]*b[1])/896 + (1859*h[16]*b[2])/512 - (4433*h[15]*b[3])/3584 + 
      (40469*h[14]*b[4])/129024 - (7865*h[13]*b[5])/129024 + (20669*h[12]*b[6])/2322432 - (70477*h[11]*b[7])/81285120;
    Fy_xy[10][0] = 0;
    Fy_xy[10][1] = -(h[10]*c[0]) + (h[9]*c[1])/2 - (h[8]*c[2])/6 + (h[7]*c[3])/24 - (h[6]*c[4])/120 + (h[5]*c[5])/720 - 
      (h[4]*c[6])/5040 + (h[3]*c[7])/40320;
    Fy_xy[10][2] = -(h[11]*b[0])/2 + (h[10]*b[1])/2 - (h[9]*b[2])/4 + (h[8]*b[3])/12 - (h[7]*b[4])/48 + (h[6]*b[5])/240 - 
      (h[5]*b[6])/1440 + (h[4]*b[7])/10080;
    Fy_xy[10][3] = 11*h[12]*c[0] - (23*h[11]*c[1])/4 + (25*h[10]*c[2])/12 - (7*h[9]*c[3])/12 + (2*h[8]*c[4])/15 - 
      (37*h[7]*c[5])/1440 + (43*h[6]*c[6])/10080 - (5*h[5]*c[7])/8064;
    Fy_xy[10][4] = (11*h[13]*b[0])/4 - (11*h[12]*b[1])/4 + (23*h[11]*b[2])/16 - (25*h[10]*b[3])/48 + (7*h[9]*b[4])/48 - 
      (h[8]*b[5])/30 + (37*h[7]*b[6])/5760 - (43*h[6]*b[7])/40320;
    Fy_xy[10][5] = (3861*h[14]*c[0])/40 - (759*h[13]*c[1])/16 + (1221*h[12]*c[2])/80 - (115*h[11]*c[3])/32 + 
      (521*h[10]*c[4])/800 - (147*h[9]*c[5])/1600 + (13*h[8]*c[6])/1344 - (107*h[7]*c[7])/179200;
    Fy_xy[10][6] = (1287*h[15]*b[0])/80 - (1287*h[14]*b[1])/80 + (253*h[13]*b[2])/32 - (407*h[12]*b[3])/160 + 
      (115*h[11]*b[4])/192 - (521*h[10]*b[5])/4800 + (49*h[9]*b[6])/3200 - (13*h[8]*b[7])/8064;
    Fy_xy[10][7] = (1287*h[16]*c[0])/70 - (1287*h[15]*c[1])/224 - (429*h[14]*c[2])/1120 + (209*h[13]*c[3])/224 - 
      (1111*h[12]*c[4])/2800 + (21247*h[11]*c[5])/201600 - (5801*h[10]*c[6])/282240 + (17453*h[9]*c[7])/5644800;
    Fy_xy[10][8] = (1287*h[17]*b[0])/560 - (1287*h[16]*b[1])/560 + (1287*h[15]*b[2])/1792 + (429*h[14]*b[3])/8960 - 
      (209*h[13]*b[4])/1792 + (1111*h[12]*b[5])/22400 - (21247*h[11]*b[6])/1612800 + (5801*h[10]*b[7])/2257920;
    Fy_xy[10][9] = (-170599*h[18]*c[0])/2240 + (34463*h[17]*c[1])/896 - (175747*h[16]*c[2])/13440 + 
      (10153*h[15]*c[3])/3072 - (49049*h[14]*c[4])/76800 + (892309*h[13]*c[5])/9676800 - (118217*h[12]*c[6])/13547520 + 
      (72007*h[11]*c[7])/541900800;
    Fy_xy[10][10] = (-170599*h[19]*b[0])/22400 + (170599*h[18]*b[1])/22400 - (34463*h[17]*b[2])/8960 + 
      (175747*h[16]*b[3])/134400 - (10153*h[15]*b[4])/30720 + (49049*h[14]*b[5])/768000 - (892309*h[13]*b[6])/96768000 + 
      (118217*h[12]*b[7])/135475200;

    Fx_xy[1][1] = b[1];
    Fx_xy[0][2] = (h[1]*c[0])/2 + c[1]/2;
    Fx_xy[0][3] = (h[2]*b[0])/6 - (h[1]*b[1])/6 - b[2]/6;
    Fx_xy[0][4] = -(h[3]*c[0])/8 + (h[2]*c[1])/8 - (h[1]*c[2])/12 - c[3]/24;
    Fx_xy[0][5] = -(h[4]*b[0])/40 + (h[3]*b[1])/40 - (h[2]*b[2])/40 + (h[1]*b[3])/60 + b[4]/120;
    Fx_xy[0][6] = (h[5]*c[0])/16 - (3*h[4]*c[1])/80 + (h[3]*c[2])/60 - (h[2]*c[3])/120 + (h[1]*c[4])/240 + c[5]/720;
    Fx_xy[0][7] = (h[6]*b[0])/112 - (h[5]*b[1])/112 + (3*h[4]*b[2])/560 - (h[3]*b[3])/420 + (h[2]*b[4])/840 - 
      (h[1]*b[5])/1680 - b[6]/5040;
    Fx_xy[0][8] = (-5*h[7]*c[0])/128 + (19*h[6]*c[1])/896 - (11*h[5]*c[2])/1344 + (h[4]*c[3])/384 - (h[3]*c[4])/1344 + 
      (h[2]*c[5])/4032 - (h[1]*c[6])/10080 - c[7]/40320;
    Fx_xy[0][9] = (-5*h[8]*b[0])/1152 + (5*h[7]*b[1])/1152 - (19*h[6]*b[2])/8064 + (11*h[5]*b[3])/12096 - 
      (h[4]*b[4])/3456 + (h[3]*b[5])/12096 - (h[2]*b[6])/36288 + (h[1]*b[7])/90720;
    Fx_xy[0][10] = (7*h[9]*c[0])/256 - (11*h[8]*c[1])/768 + (h[7]*c[2])/192 - (h[6]*c[3])/672 + (29*h[5]*c[4])/80640 - 
      (19*h[4]*c[5])/241920 + (h[3]*c[6])/60480 - (h[2]*c[7])/241920;
    Fx_xy[1][2] = -(h[2]*c[0]) + (h[1]*c[1])/2 + c[2]/2;
    Fx_xy[1][3] = -(h[3]*b[0])/3 + (h[2]*b[1])/3 - (h[1]*b[2])/6 - b[3]/6;
    Fx_xy[1][4] = (h[4]*c[0])/2 - (3*h[3]*c[1])/8 + (5*h[2]*c[2])/24 - (h[1]*c[3])/12 - c[4]/24;
    Fx_xy[1][5] = (h[5]*b[0])/10 - (h[4]*b[1])/10 + (3*h[3]*b[2])/40 - (h[2]*b[3])/24 + (h[1]*b[4])/60 + b[5]/120;
    Fx_xy[1][6] = (-3*h[6]*c[0])/8 + (17*h[5]*c[1])/80 - (7*h[4]*c[2])/80 + (h[3]*c[3])/30 - (h[2]*c[4])/80 + 
      (h[1]*c[5])/240 + c[6]/720;
    Fx_xy[1][7] = (-3*h[7]*b[0])/56 + (3*h[6]*b[1])/56 - (17*h[5]*b[2])/560 + (h[4]*b[3])/80 - (h[3]*b[4])/210 + 
      (h[2]*b[5])/560 - (h[1]*b[6])/1680 - b[7]/5040;
    Fx_xy[1][8] = (5*h[8]*c[0])/16 - (149*h[7]*c[1])/896 + (167*h[6]*c[2])/2688 - (25*h[5]*c[3])/1344 + 
      (13*h[4]*c[4])/2688 - (5*h[3]*c[5])/4032 + (h[2]*c[6])/2880 - (h[1]*c[7])/10080;
    Fx_xy[1][9] = (5*h[9]*b[0])/144 - (5*h[8]*b[1])/144 + (149*h[7]*b[2])/8064 - (167*h[6]*b[3])/24192 + 
      (25*h[5]*b[4])/12096 - (13*h[4]*b[5])/24192 + (5*h[3]*b[6])/36288 - (h[2]*b[7])/25920;
    Fx_xy[1][10] = (-35*h[10]*c[0])/128 + (109*h[9]*c[1])/768 - (13*h[8]*c[2])/256 + (19*h[7]*c[3])/1344 - 
      (53*h[6]*c[4])/16128 + (163*h[5]*c[5])/241920 - (31*h[4]*c[6])/241920 + (h[3]*c[7])/40320;
    Fx_xy[2][0] = -c[1]/2;
    Fx_xy[2][1] = b[2]/2;
    Fx_xy[2][2] = (3*h[3]*c[0])/2 - (3*h[2]*c[1])/4 + (h[1]*c[2])/4 + c[3]/4;
    Fx_xy[2][3] = (h[4]*b[0])/2 - (h[3]*b[1])/2 + (h[2]*b[2])/4 - (h[1]*b[3])/12 - b[4]/12;
    Fx_xy[2][4] = (-5*h[5]*c[0])/4 + (13*h[4]*c[1])/16 - (19*h[3]*c[2])/48 + (7*h[2]*c[3])/48 - (h[1]*c[4])/24 - c[5]/48;
    Fx_xy[2][5] = -(h[6]*b[0])/4 + (h[5]*b[1])/4 - (13*h[4]*b[2])/80 + (19*h[3]*b[3])/240 - (7*h[2]*b[4])/240 + 
      (h[1]*b[5])/120 + b[6]/240;
    Fx_xy[2][6] = (21*h[7]*c[0])/16 - (23*h[6]*c[1])/32 + (9*h[5]*c[2])/32 - (3*h[4]*c[3])/32 + (7*h[3]*c[4])/240 - 
      (h[2]*c[5])/120 + (h[1]*c[6])/480 + c[7]/1440;
    Fx_xy[2][7] = (3*h[8]*b[0])/16 - (3*h[7]*b[1])/16 + (23*h[6]*b[2])/224 - (9*h[5]*b[3])/224 + (3*h[4]*b[4])/224 - 
      (h[3]*b[5])/240 + (h[2]*b[6])/840 - (h[1]*b[7])/3360;
    Fx_xy[2][8] = (-45*h[9]*c[0])/32 + (189*h[8]*c[1])/256 - (69*h[7]*c[2])/256 + (139*h[6]*c[3])/1792 - 
      (17*h[5]*c[4])/896 + (23*h[4]*c[5])/5376 - (13*h[3]*c[6])/13440 + (h[2]*c[7])/4480;
    Fx_xy[2][9] = (-5*h[10]*b[0])/32 + (5*h[9]*b[1])/32 - (21*h[8]*b[2])/256 + (23*h[7]*b[3])/768 - 
      (139*h[6]*b[4])/16128 + (17*h[5]*b[5])/8064 - (23*h[4]*b[6])/48384 + (13*h[3]*b[7])/120960;
    Fx_xy[2][10] = (385*h[11]*c[0])/256 - (397*h[10]*c[1])/512 + (421*h[9]*c[2])/1536 - (115*h[8]*c[3])/1536 + 
      (13*h[7]*c[4])/768 - (23*h[6]*c[5])/6912 + (41*h[5]*c[6])/69120 - (7*h[4]*c[7])/69120;
    Fx_xy[3][0] = -c[2]/6;
    Fx_xy[3][1] = b[3]/6;
    Fx_xy[3][2] = -2*h[4]*c[0] + h[3]*c[1] - (h[2]*c[2])/3 + (h[1]*c[3])/12 + c[4]/12;
    Fx_xy[3][3] = (-2*h[5]*b[0])/3 + (2*h[4]*b[1])/3 - (h[3]*b[2])/3 + (h[2]*b[3])/9 - (h[1]*b[4])/36 - b[5]/36;
    Fx_xy[3][4] = (5*h[6]*c[0])/2 - (3*h[5]*c[1])/2 + (2*h[4]*c[2])/3 - (11*h[3]*c[3])/48 + (h[2]*c[4])/16 - (h[1]*c[5])/72 - 
      c[6]/144;
    Fx_xy[3][5] = (h[7]*b[0])/2 - (h[6]*b[1])/2 + (3*h[5]*b[2])/10 - (2*h[4]*b[3])/15 + (11*h[3]*b[4])/240 - 
      (h[2]*b[5])/80 + (h[1]*b[6])/360 + b[7]/720;
    Fx_xy[3][6] = (-7*h[8]*c[0])/2 + (15*h[7]*c[1])/8 - (17*h[6]*c[2])/24 + (7*h[5]*c[3])/32 - (29*h[4]*c[4])/480 + 
      (11*h[3]*c[5])/720 - (h[2]*c[6])/288 + (h[1]*c[7])/1440;
    Fx_xy[3][7] = -(h[9]*b[0])/2 + (h[8]*b[1])/2 - (15*h[7]*b[2])/56 + (17*h[6]*b[3])/168 - (h[5]*b[4])/32 + 
      (29*h[4]*b[5])/3360 - (11*h[3]*b[6])/5040 + (h[2]*b[7])/2016;
    Fx_xy[3][8] = (75*h[10]*c[0])/16 - (39*h[9]*c[1])/16 + (7*h[8]*c[2])/8 - (439*h[7]*c[3])/1792 + (103*h[6]*c[4])/1792 - 
      (97*h[5]*c[5])/8064 + (193*h[4]*c[6])/80640 - (19*h[3]*c[7])/40320;
    Fx_xy[3][9] = (25*h[11]*b[0])/48 - (25*h[10]*b[1])/48 + (13*h[9]*b[2])/48 - (7*h[8]*b[3])/72 + (439*h[7]*b[4])/16128 - 
      (103*h[6]*b[5])/16128 + (97*h[5]*b[6])/72576 - (193*h[4]*b[7])/725760;
    Fx_xy[3][10] = (-385*h[12]*c[0])/64 + (395*h[11]*c[1])/128 - (415*h[10]*c[2])/384 + (149*h[9]*c[3])/512 - 
      (33*h[8]*c[4])/512 + (85*h[7]*c[5])/6912 - (29*h[6]*c[6])/13824 + (23*h[5]*c[7])/69120;
    Fx_xy[4][0] = -c[3]/24;
    Fx_xy[4][1] = b[4]/24;
    Fx_xy[4][2] = (5*h[5]*c[0])/2 - (5*h[4]*c[1])/4 + (5*h[3]*c[2])/12 - (5*h[2]*c[3])/48 + (h[1]*c[4])/48 + c[5]/48;
    Fx_xy[4][3] = (5*h[6]*b[0])/6 - (5*h[5]*b[1])/6 + (5*h[4]*b[2])/12 - (5*h[3]*b[3])/36 + (5*h[2]*b[4])/144 - 
      (h[1]*b[5])/144 - b[6]/144;
    Fx_xy[4][4] = (-35*h[7]*c[0])/8 + (5*h[6]*c[1])/2 - (25*h[5]*c[2])/24 + (65*h[4]*c[3])/192 - (17*h[3]*c[4])/192 + 
      (11*h[2]*c[5])/576 - (h[1]*c[6])/288 - c[7]/576;
    Fx_xy[4][5] = (-7*h[8]*b[0])/8 + (7*h[7]*b[1])/8 - (h[6]*b[2])/2 + (5*h[5]*b[3])/24 - (13*h[4]*b[4])/192 + 
      (17*h[3]*b[5])/960 - (11*h[2]*b[6])/2880 + (h[1]*b[7])/1440;
    Fx_xy[4][6] = (63*h[9]*c[0])/8 - (133*h[8]*c[1])/32 + (49*h[7]*c[2])/32 - (173*h[6]*c[3])/384 + (221*h[5]*c[4])/1920 - 
      (17*h[4]*c[5])/640 + (h[3]*c[6])/180 - (h[2]*c[7])/960;
    Fx_xy[4][7] = (9*h[10]*b[0])/8 - (9*h[9]*b[1])/8 + (19*h[8]*b[2])/32 - (7*h[7]*b[3])/32 + (173*h[6]*b[4])/2688 - 
      (221*h[5]*b[5])/13440 + (17*h[4]*b[6])/4480 - (h[3]*b[7])/1260;
    Fx_xy[4][8] = (-825*h[11]*c[0])/64 + (213*h[10]*c[1])/32 - (151*h[9]*c[2])/64 + (663*h[8]*c[3])/1024 - 
      (151*h[7]*c[4])/1024 + (271*h[6]*c[5])/9216 - (871*h[5]*c[6])/161280 + (307*h[4]*c[7])/322560;
    Fx_xy[4][9] = (-275*h[12]*b[0])/192 + (275*h[11]*b[1])/192 - (71*h[10]*b[2])/96 + (151*h[9]*b[3])/576 - 
      (221*h[8]*b[4])/3072 + (151*h[7]*b[5])/9216 - (271*h[6]*b[6])/82944 + (871*h[5]*b[7])/1451520;
    Fx_xy[4][10] = (-13299*h[13]*c[0])/256 + (13189*h[12]*c[1])/512 - (4323*h[11]*c[2])/512 + (4207*h[10]*c[3])/2048 - 
      (12109*h[9]*c[4])/30720 + (17051*h[8]*c[5])/276480 - (1927*h[7]*c[6])/241920 + (403*h[6]*c[7])/483840;
    Fx_xy[5][0] = -c[4]/120;
    Fx_xy[5][1] = b[5]/120;
    Fx_xy[5][2] = -3*h[6]*c[0] + (3*h[5]*c[1])/2 - (h[4]*c[2])/2 + (h[3]*c[3])/8 - (h[2]*c[4])/40 + (h[1]*c[5])/240 + 
      c[6]/240;
    Fx_xy[5][3] = -(h[7]*b[0]) + h[6]*b[1] - (h[5]*b[2])/2 + (h[4]*b[3])/6 - (h[3]*b[4])/24 + (h[2]*b[5])/120 - 
      (h[1]*b[6])/720 - b[7]/720;
    Fx_xy[5][4] = 7*h[8]*c[0] - (31*h[7]*c[1])/8 + (37*h[6]*c[2])/24 - (23*h[5]*c[3])/48 + (29*h[4]*c[4])/240 - 
      (73*h[3]*c[5])/2880 + (13*h[2]*c[6])/2880 - (h[1]*c[7])/1440;
    Fx_xy[5][5] = (7*h[9]*b[0])/5 - (7*h[8]*b[1])/5 + (31*h[7]*b[2])/40 - (37*h[6]*b[3])/120 + (23*h[5]*b[4])/240 - 
      (29*h[4]*b[5])/1200 + (73*h[3]*b[6])/14400 - (13*h[2]*b[7])/14400;
    Fx_xy[5][6] = (-63*h[10]*c[0])/4 + (329*h[9]*c[1])/40 - (119*h[8]*c[2])/40 + (271*h[7]*c[3])/320 - 
      (197*h[6]*c[4])/960 + (17*h[5]*c[5])/384 - (83*h[4]*c[6])/9600 + (11*h[3]*c[7])/7200;
    Fx_xy[5][7] = (-9*h[11]*b[0])/4 + (9*h[10]*b[1])/4 - (47*h[9]*b[2])/40 + (17*h[8]*b[3])/40 - (271*h[7]*b[4])/2240 + 
      (197*h[6]*b[5])/6720 - (17*h[5]*b[6])/2688 + (83*h[4]*b[7])/67200;
    Fx_xy[5][8] = (495*h[12]*c[0])/16 - (1017*h[11]*c[1])/64 + (357*h[10]*c[2])/64 - (193*h[9]*c[3])/128 + 
      (43*h[8]*c[4])/128 - (199*h[7]*c[5])/3072 + (1213*h[6]*c[6])/107520 - (33*h[5]*c[7])/17920;
    Fx_xy[5][9] = (55*h[13]*b[0])/16 - (55*h[12]*b[1])/16 + (113*h[11]*b[2])/64 - (119*h[10]*b[3])/192 + 
      (193*h[9]*b[4])/1152 - (43*h[8]*b[5])/1152 + (199*h[7]*b[6])/27648 - (1213*h[6]*b[7])/967680;
    Fx_xy[5][10] = (11297*h[14]*c[0])/128 - (11165*h[13]*c[1])/256 + (10901*h[12]*c[2])/768 - (10487*h[11]*c[3])/3072 + 
      (9887*h[10]*c[4])/15360 - (27113*h[9]*c[5])/276480 + (23489*h[8]*c[6])/1935360 - (71*h[7]*c[7])/60480;
    Fx_xy[6][0] = -c[5]/720;
    Fx_xy[6][1] = b[6]/720;
    Fx_xy[6][2] = (7*h[7]*c[0])/2 - (7*h[6]*c[1])/4 + (7*h[5]*c[2])/12 - (7*h[4]*c[3])/48 + (7*h[3]*c[4])/240 - 
      (7*h[2]*c[5])/1440 + (h[1]*c[6])/1440 + c[7]/1440;
    Fx_xy[6][3] = (7*h[8]*b[0])/6 - (7*h[7]*b[1])/6 + (7*h[6]*b[2])/12 - (7*h[5]*b[3])/36 + (7*h[4]*b[4])/144 - 
      (7*h[3]*b[5])/720 + (7*h[2]*b[6])/4320 - (h[1]*b[7])/4320;
    Fx_xy[6][4] = (-21*h[9]*c[0])/2 + (91*h[8]*c[1])/16 - (35*h[7]*c[2])/16 + (21*h[6]*c[3])/32 - (77*h[5]*c[4])/480 + 
      (21*h[4]*c[5])/640 - (11*h[3]*c[6])/1920 + (h[2]*c[7])/1152;
    Fx_xy[6][5] = (-21*h[10]*b[0])/10 + (21*h[9]*b[1])/10 - (91*h[8]*b[2])/80 + (7*h[7]*b[3])/16 - (21*h[6]*b[4])/160 + 
      (77*h[5]*b[5])/2400 - (21*h[4]*b[6])/3200 + (11*h[3]*b[7])/9600;
    Fx_xy[6][6] = (231*h[11]*c[0])/8 - (1197*h[10]*c[1])/80 + (427*h[9]*c[2])/80 - (2849*h[8]*c[3])/1920 + 
      (133*h[7]*c[4])/384 - (91*h[6]*c[5])/1280 + (757*h[5]*c[6])/57600 - (127*h[4]*c[7])/57600;
    Fx_xy[6][7] = (33*h[12]*b[0])/8 - (33*h[11]*b[1])/8 + (171*h[10]*b[2])/80 - (61*h[9]*b[3])/80 + (407*h[8]*b[4])/1920 - 
      (19*h[7]*b[5])/384 + (13*h[6]*b[6])/1280 - (757*h[5]*b[7])/403200;
    Fx_xy[6][8] = (4719*h[13]*c[0])/32 - (9339*h[12]*c[1])/128 + (3047*h[11]*c[2])/128 - (1471*h[10]*c[3])/256 + 
      (1393*h[9]*c[4])/1280 - (15331*h[8]*c[5])/92160 + (13213*h[7]*c[6])/645120 - (1229*h[6]*c[7])/645120;
    Fx_xy[6][9] = (1573*h[14]*b[0])/96 - (1573*h[13]*b[1])/96 + (3113*h[12]*b[2])/384 - (3047*h[11]*b[3])/1152 + 
      (1471*h[10]*b[4])/2304 - (1393*h[9]*b[5])/11520 + (15331*h[8]*b[6])/829440 - (13213*h[7]*b[7])/5806080;
    Fx_xy[6][10] = (47047*h[15]*c[0])/1280 - (8151*h[14]*c[1])/512 + (28171*h[13]*c[2])/7680 - (1925*h[12]*c[3])/6144 - 
      (4741*h[11]*c[4])/51200 + (14081*h[10]*c[5])/307200 - (691*h[9]*c[6])/61440 + (1819*h[8]*c[7])/921600;
    Fx_xy[7][0] = -c[6]/5040;
    Fx_xy[7][1] = b[7]/5040;
    Fx_xy[7][2] = -4*h[8]*c[0] + 2*h[7]*c[1] - (2*h[6]*c[2])/3 + (h[5]*c[3])/6 - (h[4]*c[4])/30 + (h[3]*c[5])/180 - 
      (h[2]*c[6])/1260 + (h[1]*c[7])/10080;
    Fx_xy[7][3] = (-4*h[9]*b[0])/3 + (4*h[8]*b[1])/3 - (2*h[7]*b[2])/3 + (2*h[6]*b[3])/9 - (h[5]*b[4])/18 + 
      (h[4]*b[5])/90 - (h[3]*b[6])/540 + (h[2]*b[7])/3780;
    Fx_xy[7][4] = 15*h[10]*c[0] - 8*h[9]*c[1] + 3*h[8]*c[2] - (7*h[7]*c[3])/8 + (5*h[6]*c[4])/24 - (h[5]*c[5])/24 + 
      (h[4]*c[6])/140 - (43*h[3]*c[7])/40320;
    Fx_xy[7][5] = 3*h[11]*b[0] - 3*h[10]*b[1] + (8*h[9]*b[2])/5 - (3*h[8]*b[3])/5 + (7*h[7]*b[4])/40 - (h[6]*b[5])/24 + 
      (h[5]*b[6])/120 - (h[4]*b[7])/700;
    Fx_xy[7][6] = (-99*h[12]*c[0])/2 + (51*h[11]*c[1])/2 - 9*h[10]*c[2] + (59*h[9]*c[3])/24 - (67*h[8]*c[4])/120 + 
      (53*h[7]*c[5])/480 - (197*h[6]*c[6])/10080 + (253*h[5]*c[7])/80640;
    Fx_xy[7][7] = (-99*h[13]*b[0])/14 + (99*h[12]*b[1])/14 - (51*h[11]*b[2])/14 + (9*h[10]*b[3])/7 - (59*h[9]*b[4])/168 + 
      (67*h[8]*b[5])/840 - (53*h[7]*b[6])/3360 + (197*h[6]*b[7])/70560;
    Fx_xy[7][8] = (-22737*h[14]*c[0])/112 + (2805*h[13]*c[1])/28 - (3641*h[12]*c[2])/112 + (3485*h[11]*c[3])/448 - 
      (3257*h[10]*c[4])/2240 + (4393*h[9]*c[5])/20160 - (367*h[8]*c[6])/14112 + (10291*h[7]*c[7])/4515840;
    Fx_xy[7][9] = (-7579*h[15]*b[0])/336 + (7579*h[14]*b[1])/336 - (935*h[13]*b[2])/84 + (3641*h[12]*b[3])/1008 - 
      (3485*h[11]*b[4])/4032 + (3257*h[10]*b[5])/20160 - (4393*h[9]*b[6])/181440 + (367*h[8]*b[7])/127008;
    Fx_xy[7][10] = (-24167*h[16]*c[0])/560 + (8151*h[15]*c[1])/448 - (25597*h[14]*c[2])/6720 + (671*h[13]*c[3])/5376 + 
      (8327*h[12]*c[4])/44800 - (57751*h[11]*c[5])/806400 + (3691*h[10]*c[6])/225792 - (13787*h[9]*c[7])/5017600;
    Fx_xy[8][0] = -c[7]/40320;
    Fx_xy[8][1] = 0;
    Fx_xy[8][2] = (9*h[9]*c[0])/2 - (9*h[8]*c[1])/4 + (3*h[7]*c[2])/4 - (3*h[6]*c[3])/16 + (3*h[5]*c[4])/80 - 
      (h[4]*c[5])/160 + (h[3]*c[6])/1120 - (h[2]*c[7])/8960;
    Fx_xy[8][3] = (3*h[10]*b[0])/2 - (3*h[9]*b[1])/2 + (3*h[8]*b[2])/4 - (h[7]*b[3])/4 + (h[6]*b[4])/16 - (h[5]*b[5])/80 + 
      (h[4]*b[6])/480 - (h[3]*b[7])/3360;
    Fx_xy[8][4] = (-165*h[11]*c[0])/8 + (87*h[10]*c[1])/8 - 4*h[9]*c[2] + (73*h[8]*c[3])/64 - (17*h[7]*c[4])/64 + 
      (5*h[6]*c[5])/96 - (59*h[5]*c[6])/6720 + (139*h[4]*c[7])/107520;
    Fx_xy[8][5] = (-33*h[12]*b[0])/8 + (33*h[11]*b[1])/8 - (87*h[10]*b[2])/40 + (4*h[9]*b[3])/5 - (73*h[8]*b[4])/320 + 
      (17*h[7]*b[5])/320 - (h[6]*b[6])/96 + (59*h[5]*b[7])/33600;
    Fx_xy[8][6] = (-2145*h[13]*c[0])/16 + 66*h[12]*c[1] - (341*h[11]*c[2])/16 + (323*h[10]*c[3])/64 - (59*h[9]*c[4])/64 + 
      (101*h[8]*c[5])/768 - (379*h[7]*c[6])/26880 + (197*h[6]*c[7])/215040;
    Fx_xy[8][7] = (-2145*h[14]*b[0])/112 + (2145*h[13]*b[1])/112 - (66*h[12]*b[2])/7 + (341*h[11]*b[3])/112 - 
      (323*h[10]*b[4])/448 + (59*h[9]*b[5])/448 - (101*h[8]*b[6])/5376 + (379*h[7]*b[7])/188160;
    Fx_xy[8][8] = (-19305*h[15]*c[0])/896 + (6435*h[14]*c[1])/896 - (3135*h[12]*c[3])/3584 + (1419*h[11]*c[4])/3584 - 
      (193*h[10]*c[5])/1792 + (535*h[9]*c[6])/25088 - (2609*h[8]*c[7])/802816;
    Fx_xy[8][9] = (-2145*h[16]*b[0])/896 + (2145*h[15]*b[1])/896 - (715*h[14]*b[2])/896 + (1045*h[12]*b[4])/10752 - 
      (473*h[11]*b[5])/10752 + (193*h[10]*b[6])/16128 - (535*h[9]*b[7])/225792;
    Fx_xy[8][10] = (57915*h[17]*c[0])/896 - (16731*h[16]*c[1])/512 + (39897*h[15]*c[2])/3584 - (40469*h[14]*c[3])/14336 + 
      (7865*h[13]*c[4])/14336 - (20669*h[12]*c[5])/258048 + (70477*h[11]*c[6])/9031680 - (12937*h[10]*c[7])/72253440;
    Fx_xy[9][0] = 0;
    Fx_xy[9][1] = 0;
    Fx_xy[9][2] = -5*h[10]*c[0] + (5*h[9]*c[1])/2 - (5*h[8]*c[2])/6 + (5*h[7]*c[3])/24 - (h[6]*c[4])/24 + (h[5]*c[5])/144 - 
      (h[4]*c[6])/1008 + (h[3]*c[7])/8064;
    Fx_xy[9][3] = (-5*h[11]*b[0])/3 + (5*h[10]*b[1])/3 - (5*h[9]*b[2])/6 + (5*h[8]*b[3])/18 - (5*h[7]*b[4])/72 + 
      (h[6]*b[5])/72 - (h[5]*b[6])/432 + (h[4]*b[7])/3024;
    Fx_xy[9][4] = (55*h[12]*c[0])/2 - (115*h[11]*c[1])/8 + (125*h[10]*c[2])/24 - (35*h[9]*c[3])/24 + (h[8]*c[4])/3 - 
      (37*h[7]*c[5])/576 + (43*h[6]*c[6])/4032 - (25*h[5]*c[7])/16128;
    Fx_xy[9][5] = (11*h[13]*b[0])/2 - (11*h[12]*b[1])/2 + (23*h[11]*b[2])/8 - (25*h[10]*b[3])/24 + (7*h[9]*b[4])/24 - 
      (h[8]*b[5])/15 + (37*h[7]*b[6])/2880 - (43*h[6]*b[7])/20160;
    Fx_xy[9][6] = (1287*h[14]*c[0])/8 - (1265*h[13]*c[1])/16 + (407*h[12]*c[2])/16 - (575*h[11]*c[3])/96 + 
      (521*h[10]*c[4])/480 - (49*h[9]*c[5])/320 + (65*h[8]*c[6])/4032 - (107*h[7]*c[7])/107520;
    Fx_xy[9][7] = (1287*h[15]*b[0])/56 - (1287*h[14]*b[1])/56 + (1265*h[13]*b[2])/112 - (407*h[12]*b[3])/112 + 
      (575*h[11]*b[4])/672 - (521*h[10]*b[5])/3360 + (7*h[9]*b[6])/320 - (65*h[8]*b[7])/28224;
    Fx_xy[9][8] = (1287*h[16]*c[0])/56 - (6435*h[15]*c[1])/896 - (429*h[14]*c[2])/896 + (1045*h[13]*c[3])/896 - 
      (1111*h[12]*c[4])/2240 + (21247*h[11]*c[5])/161280 - (5801*h[10]*c[6])/225792 + (17453*h[9]*c[7])/4515840;
    Fx_xy[9][9] = (143*h[17]*b[0])/56 - (143*h[16]*b[1])/56 + (715*h[15]*b[2])/896 + (143*h[14]*b[3])/2688 - 
      (1045*h[13]*b[4])/8064 + (1111*h[12]*b[5])/20160 - (21247*h[11]*b[6])/1451520 + (5801*h[10]*b[7])/2032128;
    Fx_xy[9][10] = (-170599*h[18]*c[0])/2240 + (34463*h[17]*c[1])/896 - (175747*h[16]*c[2])/13440 + 
      (10153*h[15]*c[3])/3072 - (49049*h[14]*c[4])/76800 + (892309*h[13]*c[5])/9676800 - (118217*h[12]*c[6])/13547520 + 
      (72007*h[11]*c[7])/541900800;
    Fx_xy[10][0] = 0;
    Fx_xy[10][1] = 0;
    Fx_xy[10][2] = (11*h[11]*c[0])/2 - (11*h[10]*c[1])/4 + (11*h[9]*c[2])/12 - (11*h[8]*c[3])/48 + (11*h[7]*c[4])/240 - 
      (11*h[6]*c[5])/1440 + (11*h[5]*c[6])/10080 - (11*h[4]*c[7])/80640;
    Fx_xy[10][3] = (11*h[12]*b[0])/6 - (11*h[11]*b[1])/6 + (11*h[10]*b[2])/12 - (11*h[9]*b[3])/36 + (11*h[8]*b[4])/144 - 
      (11*h[7]*b[5])/720 + (11*h[6]*b[6])/4320 - (11*h[5]*b[7])/30240;
    Fx_xy[10][4] = (143*h[13]*c[0])/4 - (275*h[12]*c[1])/16 + (253*h[11]*c[2])/48 - (55*h[10]*c[3])/48 + 
      (11*h[9]*c[4])/60 - (121*h[8]*c[5])/5760 + (11*h[7]*c[6])/8064 + (11*h[6]*c[7])/161280;
    Fx_xy[10][5] = (143*h[14]*b[0])/20 - (143*h[13]*b[1])/20 + (55*h[12]*b[2])/16 - (253*h[11]*b[3])/240 + 
      (11*h[10]*b[4])/48 - (11*h[9]*b[5])/300 + (121*h[8]*b[6])/28800 - (11*h[7]*b[7])/40320;
    Fx_xy[10][6] = (-429*h[15]*c[0])/80 + (143*h[14]*c[1])/32 - (429*h[13]*c[2])/160 + (209*h[12]*c[3])/192 - 
      (1507*h[11]*c[4])/4800 + (649*h[10]*c[5])/9600 - (451*h[9]*c[6])/40320 + (1529*h[8]*c[7])/1075200;
    Fx_xy[10][7] = (-429*h[16]*b[0])/560 + (429*h[15]*b[1])/560 - (143*h[14]*b[2])/224 + (429*h[13]*b[3])/1120 - 
      (209*h[12]*b[4])/1344 + (1507*h[11]*b[5])/33600 - (649*h[10]*b[6])/67200 + (451*h[9]*b[7])/282240;
    Fx_xy[10][8] = (-21879*h[17]*c[0])/560 + (34749*h[16]*c[1])/1792 - (8151*h[15]*c[2])/1280 + (2717*h[14]*c[3])/1792 - 
      (5863*h[13]*c[4])/22400 + (46651*h[12]*c[5])/1612800 - (1133*h[11]*c[6])/2257920 - (26851*h[10]*c[7])/45158400;
    Fx_xy[10][9] = (-2431*h[18]*b[0])/560 + (2431*h[17]*b[1])/560 - (3861*h[16]*b[2])/1792 + (2717*h[15]*b[3])/3840 - 
      (2717*h[14]*b[4])/16128 + (5863*h[13]*b[5])/201600 - (46651*h[12]*b[6])/14515200 + (1133*h[11]*b[7])/20321280;
    Fx_xy[10][10] = (476333*h[19]*c[0])/22400 - (14443*h[18]*c[1])/1280 + (563849*h[17]*c[2])/134400 - 
      (260117*h[16]*c[3])/215040 + (1525381*h[15]*c[4])/5376000 - (5336903*h[14]*c[5])/96768000 + 
      (1191619*h[13]*c[6])/135475200 - (5947469*h[12]*c[7])/5419008000;
  }

  /*
  for (i=0; i<11; i++) 
    for (j=0; j<11; j++)
      printf("Fx[%ld][%ld] = %e   Fy[%ld][%ld] = %e\n", i, j, Fx_xy[i][j], i, j, Fy_xy[i][j]);
      */
}


long track_through_csbend(double **part, long n_part, CSBEND *csbend, double p_error, double Po, double **accepted,
                          double z_start, double *sigmaDelta2, char *rootname, MAXAMP *maxamp, 
                          APCONTOUR *apContour, APERTURE_DATA *apFileData, 
                          /* If iSlice is non-negative, we do one step. The caller is responsible
                           * for handling the coordinates appropriately outside this routine.
                           * The element must have been previously optimized to determine FSE and X offsets.
                           */
                          long iSlice
                          )
{
  double h;
  long i_part, i_top, particle_lost, j;
  double rho, s, Fx, Fy;
  double x, xp, y, yp, dp, dp0;
  double n, fse, dp_prime;
  double tilt, etilt, cos_ttilt, sin_ttilt, ttilt;
  double *coord, dz_lost;
  double angle, e1, e2, Kg1, Kg2;
  double psi1, psi2, he1, he2;
  double Qi[6], Qf[6];
  double dcoord_etilt[6];
  double dxi, dyi, dzi;
  double dxf, dyf, dzf;
  double delta_xp;
  double e1_kick_limit, e2_kick_limit;
  static long largeRhoWarning = 0;
  MULT_APERTURE_DATA apertureData;

  /*
  static FILE *fpdeb = NULL;
  if (!fpdeb) {
    fpdeb = fopen("apdebug.sdds", "w");
    fprintf(fpdeb, "SDDS1\n&column name=x type=float &end\n&column name=y type=float &end\n&data mode=ascii no_row_counts=1 &end\n");
  }
  */

#ifdef HAVE_GPU
  if(getElementOnGpu()){
    startGpuTimer();
    i_part = gpu_track_through_csbend(n_part, csbend, p_error, Po, accepted, 
                                      z_start, sigmaDelta2, rootname, maxamp, apContour, apFileData, iSlice);
#ifdef GPU_VERIFY     
    startCpuTimer();
    track_through_csbend(part, n_part, csbend, p_error, Po, accepted, z_start, sigmaDelta2, rootname, maxamp, apContour, apFileData, iSlice);
    compareGpuCpu(n_part, "track_through_csbend");
#endif /* GPU_VERIFY */
    return i_part;
  }
#endif /* HAVE_GPU */

  if (!csbend)
    bombElegant("null CSBEND pointer (track_through_csbend)", NULL);

  if (iSlice>=0 && csbend->referenceCorrection && csbend->refTrajectoryChangeSet==0)
    bombElegant("One-step CSBEND tracking invoked but reference correction not completed first, which is a bug.", NULL);

  setUpCsbendPhotonOutputFile(csbend, rootname, n_part);
  
  if (csbend->edge_order>1 && (csbend->edge_effects[csbend->e1Index]==2 || csbend->edge_effects[csbend->e2Index]==2) && csbend->hgap==0)
    bombElegant("CSBEND has EDGE_ORDER>1 and EDGE[12]_EFFECTS==2, but HGAP=0. This gives undefined results.", NULL);
  
  if (csbend->referenceCorrection) {
    if (csbend->refTrajectoryChangeSet==0 || csbend->refLength!=csbend->length || csbend->refAngle!=csbend->angle || csbend->refSlices!=csbend->nSlices) {
      /* Figure out the reference trajectory offsets to suppress inaccuracy in the integrator */
      CSBEND csbend0;
      double **part0;
      TRACKING_CONTEXT tcontext;

      getTrackingContext(&tcontext);
      if (tcontext.elementOccurrence>0) {
	printf("Determining reference trajectory for CSBEND %s#%ld at s=%e\n", tcontext.elementName, tcontext.elementOccurrence, tcontext.zStart);
      }
      
      if (csbend->refTrajectoryChange && csbend->refSlices) {
        free_czarray_2d((void**)csbend->refTrajectoryChange, csbend->refSlices, 5);
        csbend->refTrajectoryChange = NULL;
        csbend->refSlices = 0;
      }
      
      part0 = (double**)czarray_2d(sizeof(double), 1, totalPropertiesPerParticle);
      memset(part0[0], 0, sizeof(**part0)*totalPropertiesPerParticle);
      memcpy(&csbend0, csbend, sizeof(*csbend));
      csbend0.dx = csbend0.dy = csbend0.dz = csbend0.fse = csbend0.etilt = csbend0.epitch = csbend0.eyaw = 
        csbend0.isr = csbend0.synch_rad = csbend0.fseDipole = csbend0.fseQuadrupole = csbend0.xKick = csbend0.yKick = 0;
      
      csbend0.refTrajectoryChange = csbend->refTrajectoryChange = (double**)czarray_2d(sizeof(double), csbend->nSlices, 5);
      refTrajectoryPoints = csbend->nSlices;
      csbend0.refLength = csbend0.length;
      csbend0.refAngle = csbend0.angle;
      csbend0.refSlices = csbend0.nSlices;
      /* This forces us into the next branch on the next call to this routine */
      csbend0.refTrajectoryChangeSet = 1;
      setTrackingContext("csbend0", 0, T_CSBEND, "none", NULL);
      track_through_csbend(part0, 1, &csbend0, p_error, Po, NULL, 0, NULL, NULL, maxamp, apContour, apFileData, -1);
      csbend->refTrajectoryChangeSet = 2;  /* indicates that reference trajectory has been determined */

      csbend->refSlices = csbend->nSlices;
      csbend->refLength = csbend->length;
      csbend->refAngle = csbend->angle;
      free_czarray_2d((void**)part0, 1, totalPropertiesPerParticle);

      refTrajectoryData = csbend->refTrajectoryChange;
      refTrajectoryPoints = csbend->refSlices;
      refTrajectoryMode = SUBTRACT_TRAJECTORY;
    } else if (csbend->refTrajectoryChangeSet==1) {
      /* indicates reference trajectory is about to be determined */
      refTrajectoryData = csbend->refTrajectoryChange;
      refTrajectoryPoints = csbend->nSlices;
      refTrajectoryMode = RECORD_TRAJECTORY;
      csbend->refTrajectoryChangeSet = 2;
    } else {
      /* assume that reference trajectory already determined */
      refTrajectoryData = csbend->refTrajectoryChange;
      refTrajectoryPoints = csbend->refSlices;
      refTrajectoryMode = SUBTRACT_TRAJECTORY;
    }
  } else
    refTrajectoryMode = 0;
  
  if (csbend->angle==0) {
    exactDrift(part, n_part, csbend->length);
    return n_part;
  }
  
  if (!(csbend->edgeFlags&BEND_EDGE_DETERMINED)) 
    bombElegant("CSBEND element doesn't have edge flags set.", NULL);
  
  if (csbend->integration_order!=2 && csbend->integration_order!=4 && csbend->integration_order!=6)
    bombElegant("CSBEND integration_order is invalid--must be 2, 4, or 6", NULL);

  rho0 =  csbend->length/csbend->angle;
  if (csbend->use_bn) {
    csbend->b[0] = 0;
    csbend->b[1] = csbend->b1;
    csbend->b[2] = csbend->b2;
    csbend->b[3] = csbend->b3;
    csbend->b[4] = csbend->b4;
    csbend->b[5] = csbend->b5;
    csbend->b[6] = csbend->b6;
    csbend->b[7] = csbend->b7;
    csbend->b[8] = csbend->b8;
  } else {
    csbend->b[0] = 0;
    csbend->b[1] = csbend->k1*rho0;
    csbend->b[2] = csbend->k2*rho0;
    csbend->b[3] = csbend->k3*rho0;
    csbend->b[4] = csbend->k4*rho0;
    csbend->b[5] = csbend->k5*rho0;
    csbend->b[6] = csbend->k6*rho0;
    csbend->b[7] = csbend->k7*rho0;
    csbend->b[8] = csbend->k8*rho0;
  }
  for (j=0; j<9; j++)
    csbend->c[j] = 0;
  if (csbend->xReference>0) {
    double term = 1/csbend->xReference, f[8], g[8];
    long i;
    f[0] = csbend->f1;
    f[1] = csbend->f2;
    f[2] = csbend->f3;
    f[3] = csbend->f4;
    f[4] = csbend->f5;
    f[5] = csbend->f6;
    f[6] = csbend->f7;
    f[7] = csbend->f8;
    g[0] = csbend->g1;
    g[1] = csbend->g2;
    g[2] = csbend->g3;
    g[3] = csbend->g4;
    g[4] = csbend->g5;
    g[5] = csbend->g6;
    g[6] = csbend->g7;
    g[7] = csbend->g8;
    for (i=0; i<8; i++) {
      csbend->b[i+1] += f[i]*term;
      csbend->c[i+1] += g[i]*term;
      term *= (i+2)/csbend->xReference;
    }    
  }
  /* these adjustments ensure that we don't apply FSE+FSEDIPOLE twice for quadrupole and sextupole terms */
  csbend->b[1] *= (1+csbend->fse+csbend->fseQuadrupole)/(1+csbend->fse+csbend->fseDipole);
  csbend->c[1] *= (1+csbend->fse+csbend->fseQuadrupole)/(1+csbend->fse+csbend->fseDipole);
  csbend->b[2] *= (1+csbend->fse)/(1+csbend->fse+csbend->fseDipole);
  csbend->c[2] *= (1+csbend->fse)/(1+csbend->fse+csbend->fseDipole);

  csbend->b[0] = csbend->xKick/csbend->angle;
  csbend->c[0] = csbend->yKick/csbend->angle;

  he1 = csbend->h[csbend->e1Index];
  he2 = csbend->h[csbend->e2Index];
  if (csbend->angle<0 && csbend->malignMethod==0) {
    long i;
    angle = -csbend->angle;
    e1    = -csbend->e[csbend->e1Index];
    e2    = -csbend->e[csbend->e2Index];
    etilt = csbend->etilt*csbend->etiltSign;
    tilt  = csbend->tilt + PI;
    rho0  = csbend->length/angle;
    for (i=1; i<9; i+=2) {
      csbend->b[i] *= -1;
      csbend->c[i] *= -1;
    }
  }
  else {
    angle = csbend->angle;
    e1    = csbend->e[csbend->e1Index];
    e2    = csbend->e[csbend->e2Index];
    etilt = csbend->etilt*csbend->etiltSign;
    tilt  = csbend->tilt;
    rho0  = csbend->length/angle;
  }

  setupMultApertureData(&apertureData, -tilt, apContour, maxamp, apFileData, z_start+csbend->length/2);

  if (fabs(rho0)>1e6) {
    if (csbend->k2!=0)
      bombElegant("Error: One or more CSBENDs have radius > 1e6 but non-zero K2. Best to convert this to KQUSE or KSEXT.\n", NULL);
    if (csbend->k1!=0) {
      ELEMENT_LIST elem;
      KQUAD kquad;
      static short largeRhoWarningK1 = 0;
      if (!largeRhoWarningK1) {
#if USE_MPI
	if (myid==1)
	  dup2(fd,fileno(stdout)); /* Let the first slave processor write the output */
#endif
        printf("Warning: One or more CSBENDs have radius > 1e6 but non-zero K1.  Treated as KQUAD.\n");
        printf("*** All higher multipoles are ignored for these elements!\n");
        largeRhoWarningK1 = 1;
#if USE_MPI
	if (myid==1) {
#if defined(_WIN32)
	  freopen("NUL","w",stdout); 
#else
	  freopen("/dev/null","w",stdout); 
#endif
	}
#endif  
      }
      memset(&elem, 0, sizeof(elem));
      memset(&kquad, 0, sizeof(kquad));
      elem.p_elem = (void*)&kquad;
      elem.type = T_KQUAD;
      kquad.length = csbend->length;
      kquad.k1 = csbend->k1;
      kquad.tilt = csbend->tilt+csbend->etilt*csbend->etiltSign;
      kquad.dx = csbend->dx;
      kquad.dy = csbend->dy;
      kquad.dz = csbend->dz;
      kquad.synch_rad = csbend->synch_rad;
      kquad.isr = csbend->isr;
      kquad.isr1Particle = csbend->isr1Particle;
      kquad.nSlices = csbend->nSlices;
      kquad.integration_order = csbend->integration_order;
      return multipole_tracking2(part, n_part, &elem, p_error, Po, accepted, z_start, maxamp, NULL, apFileData, sigmaDelta2, -1);
    } else {
      if (!largeRhoWarning) {
#if USE_MPI
	if (myid==1)
	  dup2(fd,fileno(stdout)); /* Let the first slave processor write the output */
#endif
        printf("Warning: One or more CSBENDs have radius > 1e6.  Treated as EDRIFT.\n");
        printf("*** All higher multipoles are ignored for these elements!\n");
#if USE_MPI
	if (myid==1) {
#if defined(_WIN32)
	  freopen("NUL","w",stdout); 
#else
	  freopen("/dev/null","w",stdout); 
#endif
	}
#endif  
        largeRhoWarning = 1;
      }
      exactDrift(part, n_part, csbend->length);
      return n_part;
    }
  }

  fse = csbend->fse + csbend->fseDipole + (csbend->fseCorrection?csbend->fseCorrectionValue:0);
  h = 1/rho0;
  n = -csbend->b[1]/h;
  if (fse>-1)
    rho_actual = 1/((1+fse)*h);
  else
    rho_actual = 1e16/h;
  /*
  if (1) {
      TRACKING_CONTEXT tcontext;
      getTrackingContext(&tcontext);
      printf("Tracking %s#%ld: FSE=%le, FSE(User)=%le, FSE(Correction)=%le\n",
             tcontext.elementName, tcontext.elementOccurrence, fse, csbend->fse, csbend->fseCorrectionValue);
  }
  */

  e1_kick_limit = csbend->edge_kick_limit[csbend->e1Index];
  e2_kick_limit = csbend->edge_kick_limit[csbend->e2Index];
  if (csbend->kick_limit_scaling) {
    e1_kick_limit *= rho0/rho_actual;
    e2_kick_limit *= rho0/rho_actual;
  }
  if (e1_kick_limit>0 || e2_kick_limit>0) {
    printf("rho0=%e  rho_a=%e fse=%e e1_kick_limit=%e e2_kick_limit=%e\n",
            rho0, rho_actual, csbend->fse, e1_kick_limit, e2_kick_limit);
    fflush(stdout);
  }
  /* angles for fringe-field effects */
  Kg1  = 2*csbend->hgap*(csbend->fint[csbend->e1Index]>=0 ? csbend->fint[csbend->e1Index] : csbend->fintBoth)*SIGN(rho0);
  psi1 = Kg1/fabs(rho_actual)/cos(e1)*(1+sqr(sin(e1)));
  Kg2  = 2*csbend->hgap*(csbend->fint[csbend->e2Index]>=0 ? csbend->fint[csbend->e2Index] : csbend->fintBoth)*SIGN(rho0);
  psi2 = Kg2/fabs(rho_actual)/cos(e2)*(1+sqr(sin(e2)));
  if (csbend->length<0) {
    psi1 *=  -1;
    psi2 *=  -1;
  }

  /* rad_coef is d((P-Po)/Po)/ds for the on-axis, on-momentum particle, where Po is the momentum of
   * the central particle.
   */
  if (csbend->synch_rad)
    rad_coef = sqr(particleCharge)*pow3(Po)*sqr(1+fse)/(6*PI*epsilon_o*sqr(c_mks)*particleMass*sqr(rho0));
  else
    rad_coef = 0;
  /* isrConstant is the RMS increase in dP/P per meter due to incoherent SR.  */
  isrConstant = particleRadius*sqrt(55.0/(24*sqrt(3))*pow5(Po)*137.0359895/pow3(fabs(rho_actual)));
  if (!csbend->isr || (csbend->isr1Particle==0 && n_part==1))
    /* Minus sign here indicates that we accumulate ISR into sigmaDelta^2 but don't apply it to particles. */
    isrConstant *= -1; 

  if ((distributionBasedRadiation = csbend->distributionBasedRadiation)) {
    /* Sands 5.15 */
    meanPhotonsPerRadian0 = 5.0/(2.0*sqrt(3))*Po/137.0359895;  
    meanPhotonsPerMeter0 = (5*c_mks*Po*particleMass*particleRadius)/(2*sqrt(3)*hbar_mks*fabs(rho_actual));
    /* Critical energy normalized to reference energy, Sands 5.9 */
    normalizedCriticalEnergy0 = 3.0/2*hbar_mks*c_mks*pow3(Po)/fabs(rho_actual)/(Po*particleMass*sqr(c_mks));
    /* fprintf(stderr, "Mean photons per radian expected: %le   ECritical/E: %le\n", 
            meanPhotonsPerRadian0, normalizedCriticalEnergy0);
    */
    includeOpeningAngle = csbend->includeOpeningAngle;
  }
  
  computeCSBENDFieldCoefficients(csbend->b, csbend->c, h, csbend->nonlinear, csbend->expansionOrder);

  ttilt = tilt + etilt;
  if (ttilt==0) {
    cos_ttilt = 1;
    sin_ttilt = 0;
  }
  else if (fabs(fabs(ttilt)-PI)<1e-12) {
    cos_ttilt = -1;
    sin_ttilt = 0;
  }
  else if (fabs(ttilt-PIo2)<1e-12) {
    cos_ttilt = 0;
    sin_ttilt = 1;
  }
  else if (fabs(ttilt+PIo2)<1e-12) {
    cos_ttilt = 0;
    sin_ttilt = -1;
  }
  else {
    cos_ttilt = cos(ttilt);
    sin_ttilt = sin(ttilt);
  }

  dxi = dyi = dzi = 0;
  dxf = dyf = dzf = 0;
  if (csbend->malignMethod==0) {
    computeEtiltCentroidOffset(dcoord_etilt, rho0, angle, etilt, tilt);
    
    dxi = -csbend->dx;
    dzi =  csbend->dz;
    dyi = -csbend->dy;
    
    /* must use the original angle here because the translation is done after
     * the final rotation back
     */
    dxf = csbend->dx*cos(csbend->angle) + csbend->dz*sin(csbend->angle);
    dzf = csbend->dx*sin(csbend->angle) - csbend->dz*cos(csbend->angle);
    dyf = csbend->dy;
  } else {
    if (iSlice<=0) {
      if (csbend->malignMethod==1)
        offsetParticlesForEntranceCenteredMisalignmentExact
          (part, n_part, 
           csbend->dx, csbend->dy, csbend->dz, 
           csbend->epitch, csbend->eyaw, csbend->etilt, tilt, angle, csbend->length, 1);
      else 
        offsetParticlesForBodyCenteredMisalignmentExact
          (part, n_part, 
           csbend->dx, csbend->dy, csbend->dz, 
           csbend->epitch, csbend->eyaw, csbend->etilt, tilt, angle, csbend->length, 1);
    }
  }
  
  i_top = n_part-1;
#if !defined(PARALLEL)
  multipoleKicksDone += n_part*csbend->nSlices*(csbend->integration_order==4?4:1);
#endif

  if (sigmaDelta2)
    *sigmaDelta2 = 0;

  for (i_part=0; i_part<=i_top; i_part++) {
    if (!part) {
      printf("error: null particle array found (working on particle %ld) (track_through_csbend)\n", i_part);
      fflush(stdout);
      abort();
    }
    if (!(coord = part[i_part])) {
      printf("error: null coordinate pointer for particle %ld (track_through_csbend)\n", i_part);
      fflush(stdout);
      abort();
    }
    if (accepted && !accepted[i_part]) {
      printf("error: null accepted particle pointer for particle %ld (track_through_csbend)\n", i_part);
      fflush(stdout);
      abort();
    }

    if (csbend->malignMethod==0 && iSlice<=0) {
      coord[4] += dzi*sqrt(1 + sqr(coord[1]) + sqr(coord[3]));
      coord[0]  = coord[0] + dxi + dzi*coord[1];
      coord[2]  = coord[2] + dyi + dzi*coord[3];
      
      x  =  coord[0]*cos_ttilt + coord[2]*sin_ttilt;
      y  = -coord[0]*sin_ttilt + coord[2]*cos_ttilt;
      xp =  coord[1]*cos_ttilt + coord[3]*sin_ttilt;
      yp = -coord[1]*sin_ttilt + coord[3]*cos_ttilt;
      s  = coord[4];
      dp = dp0 = coord[5];
    } else {
      x  = coord[0];
      y  = coord[2];
      xp = coord[1];
      yp = coord[3];
      s  = coord[4];
      dp = dp0 = coord[5];
    }
    
    if (iSlice<=0) {
      if (csbend->edgeFlags&BEND_EDGE1_EFFECTS) {
        if (csbend->edge_order<=1 && csbend->edge_effects[csbend->e1Index]==1) {
          /* apply edge focusing, nonsymplectic method */
          rho = (1+dp)*rho_actual;
          delta_xp = tan(e1)/rho*x;
          if (e1_kick_limit>0 && fabs(delta_xp)>e1_kick_limit)
            delta_xp = SIGN(delta_xp)*e1_kick_limit;
          xp += delta_xp;
          yp -= tan(e1-psi1/(1+dp))/rho*y;
        } else if (csbend->edge_order>=2 && csbend->edge_effects[csbend->e1Index]==1) {
          /* apply edge focusing, nonsymplectic method */
          rho = (1+dp)*rho_actual;
          apply_edge_effects(&x, &xp, &y, &yp, rho, n, e1, he1, psi1*(1+dp), -1);
        } else if (csbend->edge_effects[csbend->e1Index]==2) {
          /* K. Hwang's approach */
          /* load input coordinates into arrays */
          Qi[0] = x;  Qi[1] = xp;  Qi[2] = y;  Qi[3] = yp;  Qi[4] = 0;  Qi[5] = dp;
          convertToDipoleCanonicalCoordinates(Qi, csbend->expandHamiltonian);
          dipoleFringeKHwang(Qf, Qi, rho_actual, -1., csbend->edge_order, csbend->b[1]/rho0, e1, 2*csbend->hgap, 
                             csbend->fint[csbend->e1Index]>=0?csbend->fint[csbend->e1Index]:csbend->fintBoth,
                             csbend->h[csbend->e1Index]);
          /* retrieve coordinates from arrays */
          convertFromDipoleCanonicalCoordinates(Qf, csbend->expandHamiltonian);
          x  = Qf[0];  
          xp = Qf[1];  
          y  = Qf[2];  
          yp = Qf[3];  
          dp = Qf[5];
        } else if (csbend->edge_effects[csbend->e1Index]==3) {
          /* simple-minded symplectic approach */
          applySimpleDipoleEdgeKick(&xp, &yp, x, y, dp, rho_actual, e1, psi1, e1_kick_limit, csbend->expandHamiltonian);
        } else if (csbend->edge_effects[csbend->e1Index]==4) {
          /* K. Hwang's approach as symplectified by R. Lindberg */
          /* load input coordinates into arrays */
          Qi[0] = x;  Qi[1] = xp;  Qi[2] = y;  Qi[3] = yp;  Qi[4] = 0;  Qi[5] = dp;
          convertToDipoleCanonicalCoordinates(Qi, csbend->expandHamiltonian);
          dipoleFringeKHwangRLindberg(Qf, Qi, rho_actual, -1., csbend->b[1]/rho0, e1,
                                      2*csbend->hgap,
				    csbend->fint[csbend->e1Index]>=0?csbend->fint[csbend->e1Index]:csbend->fintBoth,
                                      csbend->h[csbend->e1Index]);
          /* retrieve coordinates from arrays */
          convertFromDipoleCanonicalCoordinates(Qf, csbend->expandHamiltonian);
          x  = Qf[0];  
          xp = Qf[1];  
          y  = Qf[2];  
          yp = Qf[3];  
          dp = Qf[5];
        }
      }
    }

    /* load input coordinates into arrays */
    Qi[0] = x;  Qi[1] = xp;  Qi[2] = y;  Qi[3] = yp;  Qi[4] = 0;  Qi[5] = dp;

    if (iSlice<=0) {
      if (csbend->edgeFlags&BEND_EDGE1_EFFECTS && e1!=0 && rad_coef) {
        /* pre-adjust dp/p to anticipate error made by integrating over entire sector */
        computeCSBENDFields(&Fx, &Fy, x, y);
        
        dp_prime = -rad_coef*(sqr(Fx)+sqr(Fy))*sqr(1+dp)*sqrt(sqr(1+x/rho0)+sqr(xp)+sqr(yp));
        Qi[5] -= dp_prime*x*tan(e1);
      }

      convertToDipoleCanonicalCoordinates(Qi, csbend->expandHamiltonian);
    }
    
    if (csbend->expandHamiltonian)
      particle_lost = !integrate_csbend_ordn_expanded(Qf, Qi, sigmaDelta2, csbend->length, csbend->nSlices, iSlice, rho0, Po, &dz_lost,
                                                      &apertureData, csbend->integration_order);
    else
      particle_lost = !integrate_csbend_ordn(Qf, Qi, sigmaDelta2, csbend->length, csbend->nSlices, iSlice, rho0, Po, &dz_lost,
                                             &apertureData, csbend->integration_order);

    if (iSlice<0 || iSlice==(csbend->nSlices-1) || particle_lost) {
      if (csbend->fseCorrection==1)
        Qf[4] -= csbend->fseCorrectionPathError;
      convertFromDipoleCanonicalCoordinates(Qf, csbend->expandHamiltonian);
    }

    if (particle_lost) {
      if (!part[i_top]) {
        printf("error: couldn't swap particles %ld and %ld--latter is null pointer (track_through_csbend)\n",
                i_part, i_top);
        fflush(stdout);
        abort();
      }
      memcpy(part[i_part],  Qf, sizeof(part[i_part][0])*6);
      convertFromCSBendCoords(part+i_part, 1, rho0, cos_ttilt, sin_ttilt, 0);
      swapParticles(part[i_part], part[i_top]);
      if (accepted) {
        if (!accepted[i_top]) {
          printf(
                  "error: couldn't swap acceptance data for particles %ld and %ld--latter is null pointer (track_through_csbend)\n",
                  i_part, i_top);
          fflush(stdout);
          abort();
        }
        swapParticles(accepted[i_part], accepted[i_top]);
      }
      part[i_top][4] = z_start + dz_lost;
      part[i_top][5] = Po*(1+part[i_top][5]);

      i_top--;
      i_part--;
      continue;
    }

    if (iSlice<0 || iSlice==(csbend->nSlices-1)) {
      if (csbend->edgeFlags&BEND_EDGE2_EFFECTS && e2!=0 && rad_coef) {
        /* post-adjust dp/p to correct error made by integrating over entire sector */
        x = Qf[0];
        xp = Qf[1];
        y = Qf[2];
        yp = Qf[3];
        dp = Qf[5];
        
        computeCSBENDFields(&Fx, &Fy, x, y);
        
        dp_prime = -rad_coef*(sqr(Fx)+sqr(Fy))*sqr(1+dp)*sqrt(sqr(1+x/rho0)+sqr(xp)+sqr(yp));
        Qf[5] -= dp_prime*x*tan(e2);
      }

      /* get final coordinates */
      if (rad_coef || isrConstant) {
        double p0, p1;
        double beta0, beta1;
        /* fix previous distance information to reflect new velocity--since distance
         * is really time-of-flight at the current velocity 
         */
        p0 = Po*(1+dp0);
        beta0 = p0/sqrt(sqr(p0)+1);
        p1 = Po*(1+Qf[5]);
        beta1 = p1/sqrt(sqr(p1)+1);
        s = beta1*s/beta0 + Qf[4];
      }
      else
        s += Qf[4];
    } else
        s += Qf[4];
    x = Qf[0];  xp = Qf[1];  y = Qf[2];  yp = Qf[3];  dp = Qf[5];

    if (iSlice<0 || iSlice==(csbend->nSlices-1)) {
      if (csbend->edgeFlags&BEND_EDGE2_EFFECTS) {
        /* apply edge focusing */
        if (csbend->edge_order<=1 && csbend->edge_effects[csbend->e2Index]==1) {
          rho = (1+dp)*rho_actual;
          delta_xp = tan(e2)/rho*x;
          if (e2_kick_limit>0 && fabs(delta_xp)>e2_kick_limit)
            delta_xp = SIGN(delta_xp)*e2_kick_limit;
          xp += delta_xp;
          yp -= tan(e2-psi2/(1+dp))/rho*y;
        } else if (csbend->edge_order>=2 && csbend->edge_effects[csbend->e2Index]==1) {
          rho = (1+dp)*rho_actual;
          apply_edge_effects(&x, &xp, &y, &yp, rho, n, e2, he2, psi2*(1+dp), 1);
        } else if (csbend->edge_effects[csbend->e2Index]==2) {
          /* load input coordinates into arrays */
          Qi[0] = x;  Qi[1] = xp;  Qi[2] = y;  Qi[3] = yp;  Qi[4] = 0;  Qi[5] = dp;
          convertToDipoleCanonicalCoordinates(Qi, csbend->expandHamiltonian);
          dipoleFringeKHwang(Qf, Qi, rho_actual, 1., csbend->edge_order, csbend->b[1]/rho0, e2, 2*csbend->hgap, 
                             csbend->fint[csbend->e2Index]>=0?csbend->fint[csbend->e2Index]:csbend->fintBoth, 
                             csbend->h[csbend->e2Index]);
          /* retrieve coordinates from arrays */
          convertFromDipoleCanonicalCoordinates(Qf, csbend->expandHamiltonian);
          x  = Qf[0];  
          xp = Qf[1];  
          y  = Qf[2];  
          yp = Qf[3];  
          dp = Qf[5];
        } else if (csbend->edge_effects[csbend->e2Index]==3) {
          applySimpleDipoleEdgeKick(&xp, &yp, x, y, dp, rho_actual, e2, psi2, e2_kick_limit, csbend->expandHamiltonian);
        } else if (csbend->edge_effects[csbend->e2Index]==4) {
          /* K. Hwang's approach as symplectified by R. Lindberg */
          /* load input coordinates into arrays */
          Qi[0] = x;  Qi[1] = xp;  Qi[2] = y;  Qi[3] = yp;  Qi[4] = 0;  Qi[5] = dp;
          convertToDipoleCanonicalCoordinates(Qi, csbend->expandHamiltonian);
          dipoleFringeKHwangRLindberg(Qf, Qi, rho_actual, 1., csbend->b[1]/rho0, e2, 2*csbend->hgap, 
                                      csbend->fint[csbend->e2Index]>=0?csbend->fint[csbend->e2Index]:csbend->fintBoth, 
                                      csbend->h[csbend->e2Index]);
          /* retrieve coordinates from arrays */
          convertFromDipoleCanonicalCoordinates(Qf, csbend->expandHamiltonian);
          x  = Qf[0];  
          xp = Qf[1];  
          y  = Qf[2];  
          yp = Qf[3];  
          dp = Qf[5];
        }
      }
    }

    if (csbend->malignMethod==0 && (iSlice<0 || iSlice==(csbend->nSlices-1))) {
      coord[0] =  x*cos_ttilt -  y*sin_ttilt + dcoord_etilt[0];
      coord[2] =  x*sin_ttilt +  y*cos_ttilt + dcoord_etilt[2];
      coord[1] = xp*cos_ttilt - yp*sin_ttilt + dcoord_etilt[1];
      coord[3] = xp*sin_ttilt + yp*cos_ttilt + dcoord_etilt[3];
      coord[4] = s + dcoord_etilt[4];
      coord[5] = dp;
      
      coord[0] += dxf + dzf*coord[1];
      coord[2] += dyf + dzf*coord[3];
      coord[4] += dzf*sqrt(1+ sqr(coord[1]) + sqr(coord[3]));
    } else {
      coord[0] = x;
      coord[2] = y;
      coord[1] = xp;
      coord[3] = yp;
      coord[4] = s;
      coord[5] = dp;
    }
  }

  if (iSlice<0 || iSlice==(csbend->nSlices-1)) {
    if (csbend->malignMethod!=0) {
      if (csbend->malignMethod==1)
        offsetParticlesForEntranceCenteredMisalignmentExact
          (part, n_part, 
           csbend->dx, csbend->dy, csbend->dz, 
           csbend->epitch, csbend->eyaw, csbend->etilt, tilt, angle, csbend->length, 2);
      else 
        offsetParticlesForBodyCenteredMisalignmentExact
          (part, n_part, 
           csbend->dx, csbend->dy, csbend->dz, 
           csbend->epitch, csbend->eyaw, csbend->etilt, tilt, angle, csbend->length, 2);
    }
  }

  if (distributionBasedRadiation) {
    radiansTotal += fabs(csbend->angle);
    /*
      fprintf(stderr, "%e radians, photons/particle=%e, photons/radian = %e, mean y = %e\n",
      radiansTotal, photonCount/(1.0*i_top), photonCount/radiansTotal/(1.0*i_top), energyCount/photonCount);
    */
    distributionBasedRadiation = 0;
  }

  /*
  for (i_part=i_top+1; i_part<n_part; i_part++) 
      fprintf(fpdeb, "%le %le\n", part[i_part][0], part[i_part][2]);
  */

  if (sigmaDelta2)
    /* Return average value for all particles */
    *sigmaDelta2 /= i_top+1;

  if (csbend->photonOutputFile && !SDDS_UpdatePage(csbend->SDDSphotons, FLUSH_TABLE))
      SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
  
  return(i_top+1);
}

/* Scale the slopes stored in Qi[1] and Qi[3] in place, turning them into the
 * momentum-like quantities used by the CSBEND integrators.
 *
 * Qi        6-element coordinate array; only Qi[1] and Qi[3] are modified,
 *           Qi[5] (fractional momentum deviation) is read.
 * expanded  if nonzero, the scale factor is simply (1+Qi[5]); otherwise it is
 *           (1+Qi[5])/sqrt(1 + Qi[1]^2 + Qi[3]^2).
 */
void convertToDipoleCanonicalCoordinates(double *Qi, long expanded)
{
  double scale = (1 + Qi[5]);

  if (!expanded)
    scale = scale/sqrt(1 + sqr(Qi[1]) + sqr(Qi[3]));

  Qi[1] *= scale;
  Qi[3] *= scale;
}

/* Inverse of convertToDipoleCanonicalCoordinates(): scale Qi[1] and Qi[3] in
 * place back to slopes.
 *
 * Qi        6-element coordinate array; only Qi[1] and Qi[3] are modified,
 *           Qi[5] (fractional momentum deviation) is read.
 * expanded  if nonzero, the values are divided by (1+Qi[5]); otherwise by
 *           sqrt((1+Qi[5])^2 - Qi[1]^2 - Qi[3]^2).
 *
 * No check is made that the divisor is real and nonzero.
 */
void convertFromDipoleCanonicalCoordinates(double *Qi, long expanded)
{
  double divisor, invDivisor;

  if (expanded)
    divisor = (1+Qi[5]);
  else
    divisor = sqrt(sqr(1+Qi[5])-sqr(Qi[1])-sqr(Qi[3]));

  /* multiply by the reciprocal (rather than divide) to keep the arithmetic unchanged */
  invDivisor = 1/divisor;
  Qi[1] *= invDivisor;
  Qi[3] *= invDivisor;
}



  /* BETA is 2^(1/3) */
#define BETA 1.25992104989487316477



/* integrate_csbend_ordn
 *
 * Propagate one particle through the body of a CSBEND with a drift/kick
 * integrator of order 2, 4, or 6.  The drift is evaluated with trigonometric
 * functions (no expansion of the Hamiltonian; see
 * integrate_csbend_ordn_expanded() for the expanded variant).
 *
 * On entry Qi holds (x, qx, y, qy, s, delta); it is copied to Qf, which is
 * then updated in place.  On success Qf[4] is incremented by the accumulated
 * path length.
 *
 * Besides its arguments, the routine reads the file-level globals rho_actual,
 * rad_coef, isrConstant, distributionBasedRadiation, includeOpeningAngle,
 * meanPhotonsPerMeter0 and normalizedCriticalEnergy0, and the reference
 * trajectory state (refTrajectoryMode, refTrajectoryPoints, refTrajectoryData).
 *
 * Returns 1 if the particle survives, 0 if it is lost (aperture or obstruction
 * hit, or the drift cannot be evaluated for the particle's momenta).  *dz_lost
 * receives the sum of the drift lengths taken so far, or the full length if the
 * loss is detected by the final check.
 */
long integrate_csbend_ordn
(
 double *Qf,                  /* final coordinates */
 double *Qi,                  /* initial coordinates */
 double *sigmaDelta2,         /* accumulate the energy spread increase for propagation of radiation matrix */
 double s,                    /* arc length */
 long n,                      /* number of slices */
 long iSlice,                 /* If <0, integrate the full magnet. If >=0, integrate just a single part and return.               
                               * This is needed to allow propagation of the radiation matrix. */
 double rho0,                 /* nominal bending radius */
 double p0,                   /* central momentum */
 double *dz_lost,             /* return of loss position */
 MULT_APERTURE_DATA *apData,  /* aperture data */
 short integration_order      /* 2, 4, or 6 */
)
{
  long i;
  double factor, f, phi, ds, dsh, dist;
  double Fx, Fy;
  double sine, cosi, tang;
  double sin_phi, cos_phi;

  /* Fractions of the slice length used for each drift and kick substep.
   * A kick fraction of 0 marks the end of the substep sequence.
   */
  static const double driftFrac2[2] = {
    0.5, 0.5
  };
  static const double kickFrac2[2] = {
    1.0, 0.0
  };

  static const double driftFrac4[4] = {
    0.5/(2-BETA),  (1-BETA)/(2-BETA)/2,  (1-BETA)/(2-BETA)/2,  0.5/(2-BETA)
  } ;
  static const double kickFrac4[4] = {
    1./(2-BETA),  -BETA/(2-BETA),  1/(2-BETA),  0
  } ;

  /* From AOP-TN-2020-064 */
  static const double driftFrac6[8] = {
    0.39225680523878, 0.5100434119184585, -0.47105338540975655, 0.0687531682525181,
    0.0687531682525181, -0.47105338540975655, 0.5100434119184585, 0.39225680523878,
  } ;
  static const double kickFrac6[8] = {
    0.784513610477560, 0.235573213359357, -1.17767998417887, 1.3151863206839063,
    -1.17767998417887,  0.235573213359357, 0.784513610477560, 0
  } ;

  const double *driftFrac = NULL, *kickFrac = NULL;
  long nSubsteps = 0;
  switch (integration_order) {
  case 2:
    nSubsteps = 2;
    driftFrac = driftFrac2;
    kickFrac = kickFrac2;
    break;
  case 4:
    nSubsteps = 4;
    driftFrac = driftFrac4;
    kickFrac = kickFrac4;
    break;
  case 6:
    nSubsteps = 8;
    driftFrac = driftFrac6;
    kickFrac = kickFrac6;
    break;
  default:
    /* integration_order is a short; cast so the argument matches %ld */
    bombElegantVA("invalid order %ld given for symplectic integrator", (long)integration_order);
    break;
  }
  
#define X0 Qi[0]
#define XP0 Qi[1]
#define Y0 Qi[2]
#define YP0 Qi[3]
#define S0 Qi[4]
#define DPoP0 Qi[5]

#define X Qf[0]
#define QX Qf[1]
#define Y Qf[2]
#define QY Qf[3]
#define S Qf[4]
#define DPoP Qf[5]

  if (refTrajectoryMode && refTrajectoryPoints!=n)
    bombElegant("Problem with recorded reference trajectory for CSBEND element---has wrong number of points\n", NULL);
  if (!Qf)
    bombElegant("NULL final coordinates pointer (integrate_csbend_ordn)", NULL);
  if (!Qi)
    bombElegant("NULL initial coordinates pointer (integrate_csbend_ordn)", NULL);
  if (n<1)
    bombElegant("invalid number of steps (integrate_csbend_ordn)", NULL);

  memcpy(Qf, Qi, sizeof(*Qi)*6);

  dist = 0;
  s /= n;       /* from here on, s is the length of one slice */
  *dz_lost = 0; /* we'll accumulate this value even if the particle isn't lost */
  for (i=0; i<n; i++) {
    long j;
    if ((apData && !checkMultAperture(X, Y, apData)) ||
        insideObstruction(Qf, GLOBAL_LOCAL_MODE_SEG, 0.0, i, n)) {
      return 0;
    }
    for (j=0; j<nSubsteps; j++) {
      /* do drift */
      dsh = s*driftFrac[j];
      f = sqr(1+DPoP) - sqr(QY);
      if (f<=0) {
        return 0;
      }
      f = sqrt(f);
      if (fabs(QX/f)>1) {
        return 0;
      }
      sin_phi = QX/f;
      phi = asin(sin_phi);
      sine = sin(dsh/rho0+phi);
      cosi = cos(dsh/rho0+phi);
      if (cosi==0) {
        return 0;
      }
      tang = sine/cosi;
      cos_phi = cos(phi);
      QX = f*sine;
      /* factor uses the value of X from before this drift */
      factor = (rho0+X)*cos_phi/f*(tang-sin_phi/cos_phi);
      Y += QY*factor;
      dist += factor*(1+DPoP);
      *dz_lost += dsh;
      f = cos_phi/cosi;
      X  = rho0*(f-1) + f*X;
      if (apData && !checkMultAperture(X, Y, apData)) {
        return 0;
      }

      /* a zero kick fraction terminates the substep sequence */
      if (kickFrac[j]==0)
        break;
      /* do kick */
      ds = s*kickFrac[j];
      /* -- calculate the scaled fields */
      computeCSBENDFields(&Fx, &Fy, X, Y);
      
      /* --do kicks */
      QX += -ds*(1+X/rho0)*Fy/rho_actual;
      QY += ds*(1+X/rho0)*Fx/rho_actual;
      if (rad_coef || isrConstant) {
        /* The slice length is divided evenly among the nSubsteps-1 kicks
         * for the dsISR argument.
         */
        addRadiationKick(&QX, &QY, &DPoP, sigmaDelta2, 
                         X, Y, (i+1./3)*s, s*n, 1./rho0, Fx, Fy, 
                         ds, rad_coef, s/(nSubsteps-1), isrConstant,
                         distributionBasedRadiation, includeOpeningAngle,
                         meanPhotonsPerMeter0, normalizedCriticalEnergy0, p0);
      }
    }

    if (refTrajectoryMode==RECORD_TRAJECTORY) {
      /* store the end-of-slice coordinates, then restart from zero */
      refTrajectoryData[i][0] = X;
      refTrajectoryData[i][1] = QX;
      refTrajectoryData[i][2] = Y;
      refTrajectoryData[i][3] = QY;
      refTrajectoryData[i][4] = dist - s;
      X = QX = Y = QY = dist = 0;
    }
    if (refTrajectoryMode==SUBTRACT_TRAJECTORY) {
      X -= refTrajectoryData[i][0];
      QX -= refTrajectoryData[i][1];
      Y -= refTrajectoryData[i][2];
      QY -= refTrajectoryData[i][3];
      dist -= refTrajectoryData[i][4];
    }
    /* single-slice mode: only one slice is integrated per call */
    if (iSlice>=0)
      break;
  }
  if ((apData && !checkMultAperture(X, Y, apData)) ||
      insideObstruction(Qf, GLOBAL_LOCAL_MODE_SEG, 0.0, i, n)) {
    *dz_lost = n*s;
    return 0;
  }

  Qf[4] += dist;
  return 1;
}


/* integrate_csbend_ordn_expanded
 *
 * Propagate one particle through the body of a CSBEND with a drift/kick
 * integrator of order 2, 4, or 6, using the expanded Hamiltonian described
 * below.  Arguments and return value are the same as for
 * integrate_csbend_ordn():
 *
 *   Qf, Qi        final and initial coordinates (x, qx, y, qy, s, delta);
 *                 Qi is copied to Qf, which is updated in place
 *   sigmaDelta2   passed through to addRadiationKick()
 *   s             arc length of the magnet
 *   n             number of slices
 *   iSlice        if >=0, only one slice is integrated per call
 *   rho0          nominal bending radius
 *   p0            central momentum
 *   dz_lost       receives the sum of drift lengths taken, or the full length
 *                 if the loss is detected by the final check
 *   apData        aperture data (may be NULL)
 *   integration_order  2, 4, or 6
 *
 * Returns 1 if the particle survives, 0 if it is lost to an aperture or
 * obstruction.
 */
long integrate_csbend_ordn_expanded(double *Qf, double *Qi, double *sigmaDelta2, double s, long n, long iSlice, double rho0, double p0,
                                    double *dz_lost, MULT_APERTURE_DATA *apData, short integration_order)
/* The Hamiltonian in this case is approximated as
 * H = Hd + Hf, where Hd is the drift part and Hf is the field part.
 * Hd = Hd1 + Hd2 + Hd1, where
 * Hd1 = -0.5*(1+x/rho0)*(1+delta) 
 * Hd2 = 0.5*(qx^2+qy^2)/(1+delta)
 */
{
  long i;
  double ds, dsh, dist;
  double Fx, Fy;

  /* Fractions of the slice length used for each drift and kick substep.
   * A kick fraction of 0 marks the end of the substep sequence.
   */
  static const double driftFrac2[2] = {
    0.5, 0.5
  };
  static const double kickFrac2[2] = {
    1.0, 0.0
  };

  static const double driftFrac4[4] = {
    0.5/(2-BETA),  (1-BETA)/(2-BETA)/2,  (1-BETA)/(2-BETA)/2,  0.5/(2-BETA)
  } ;
  static const double kickFrac4[4] = {
    1./(2-BETA),  -BETA/(2-BETA),  1/(2-BETA),  0
  } ;

  /* From AOP-TN-2020-064 */
  static const double driftFrac6[8] = {
    0.39225680523878, 0.5100434119184585, -0.47105338540975655, 0.0687531682525181,
    0.0687531682525181, -0.47105338540975655, 0.5100434119184585, 0.39225680523878,
  } ;
  static const double kickFrac6[8] = {
    0.784513610477560, 0.235573213359357, -1.17767998417887, 1.3151863206839063,
    -1.17767998417887,  0.235573213359357, 0.784513610477560, 0
  } ;

  const double *driftFrac = NULL, *kickFrac = NULL;
  long nSubsteps = 0;
  switch (integration_order) {
  case 2:
    nSubsteps = 2;
    driftFrac = driftFrac2;
    kickFrac = kickFrac2;
    break;
  case 4:
    nSubsteps = 4;
    driftFrac = driftFrac4;
    kickFrac = kickFrac4;
    break;
  case 6:
    nSubsteps = 8;
    driftFrac = driftFrac6;
    kickFrac = kickFrac6;
    break;
  default:
    /* integration_order is a short; cast so the argument matches %ld */
    bombElegantVA("invalid order %ld given for symplectic integrator", (long)integration_order);
    break;
  }
#define X0 Qi[0]
#define XP0 Qi[1]
#define Y0 Qi[2]
#define YP0 Qi[3]
#define S0 Qi[4]
#define DPoP0 Qi[5]

#define X Qf[0]
#define QX Qf[1]
#define Y Qf[2]
#define QY Qf[3]
#define S Qf[4]
#define DPoP Qf[5]

  if (refTrajectoryMode && refTrajectoryPoints!=n)
    bombElegant("Problem with recorded reference trajectory for CSBEND element---has wrong number of points\n", NULL);
  if (!Qf)
    bombElegant("NULL final coordinates pointer (integrate_csbend_ordn_expanded)", NULL);
  if (!Qi)
    bombElegant("NULL initial coordinates pointer (integrate_csbend_ordn_expanded)", NULL);
  if (n<1)
    bombElegant("invalid number of steps (integrate_csbend_ordn_expanded)", NULL);

  memcpy(Qf, Qi, sizeof(*Qi)*6);

  dist = 0;
  s /= n;       /* from here on, s is the length of one slice */
  *dz_lost = 0; /* we'll accumulate this value even if the particle isn't lost */
  for (i=0; i<n; i++) {
    long j;
    if ((apData && !checkMultAperture(X, Y, apData)) ||
        insideObstruction(Qf, GLOBAL_LOCAL_MODE_SEG, 0.0, i, n)) {
      return 0;
    }
    for (j=0; j<nSubsteps; j++) {
      /* do drift: half of the Hd1 step, the Hd2 step, then the other half of Hd1 */
      dsh = s*driftFrac[j];
      QX += dsh*(1+DPoP)/(2*rho0);
      dist += dsh*(1 + (sqr(QX)+sqr(QY))/2);
      *dz_lost += dsh;
      X += QX*dsh/(1+DPoP);
      Y += QY*dsh/(1+DPoP);
      QX += dsh*(1+DPoP)/(2*rho0);
      
      if (apData && !checkMultAperture(X, Y, apData)) {
        return 0;
      }
      
      /* a zero kick fraction terminates the substep sequence */
      if (kickFrac[j]==0)
        break;
      /* do kick */
      ds = s*kickFrac[j];
      /* -- calculate the scaled fields */
      computeCSBENDFields(&Fx, &Fy, X, Y);
      
      /* --do kicks */
      QX += -ds*(1+X/rho0)*Fy/rho_actual;
      QY += ds*(1+X/rho0)*Fx/rho_actual;
      if (rad_coef || isrConstant) {
        /* The slice length is divided evenly among the nSubsteps-1 kicks
         * for the dsISR argument, as in integrate_csbend_ordn().  (A fixed
         * s/3 is only right for the 4th-order scheme, which has 3 kicks.)
         */
        addRadiationKick(&QX, &QY, &DPoP, sigmaDelta2, 
                         X, Y, (i+1./3)*s, s*n, 1./rho0, Fx, Fy, 
                         ds, rad_coef, s/(nSubsteps-1), isrConstant,
                         distributionBasedRadiation, includeOpeningAngle,
                         meanPhotonsPerMeter0, normalizedCriticalEnergy0, p0);
      }
    }
  
    if (refTrajectoryMode==RECORD_TRAJECTORY) {
      /* store the end-of-slice coordinates, then restart from zero */
      refTrajectoryData[i][0] = X;
      refTrajectoryData[i][1] = QX;
      refTrajectoryData[i][2] = Y;
      refTrajectoryData[i][3] = QY;
      refTrajectoryData[i][4] = dist - s;
      X = QX = Y = QY = dist = 0;
    }
    if (refTrajectoryMode==SUBTRACT_TRAJECTORY) {
      X -= refTrajectoryData[i][0];
      QX -= refTrajectoryData[i][1];
      Y -= refTrajectoryData[i][2];
      QY -= refTrajectoryData[i][3];
      dist -= refTrajectoryData[i][4];
    }
    /* single-slice mode: only one slice is integrated per call */
    if (iSlice>=0)
      break;
  }
  if ((apData && !checkMultAperture(X, Y, apData)) ||
      insideObstruction(Qf, GLOBAL_LOCAL_MODE_SEG, 0.0, i, n)) {
    *dz_lost = n*s;
    return 0;
  }

  Qf[4] += dist;
  return 1;
}

/* File-level CSR wake record.  track_through_csbendCSR() clears csrWake.valid
 * on entry.  NOTE(review): presumably holds the last wake computed in a
 * CSRCSBEND for later use -- confirm against the consumers of this struct.
 */
CSR_LAST_WAKE csrWake;

/* Names of the Derbenev-criterion options, one per option
 * (N_DERBENEV_CRITERION_OPTIONS entries).
 */
static char *derbenevCriterionOption[N_DERBENEV_CRITERION_OPTIONS] = {
  "disable", "evaluate", "enforce"};

/* Forward declaration; the definition is not in this part of the file.
 * NOTE(review): judging by the parameter names, this reads frequency, real and
 * imaginary columns from a file into newly set arrays -- confirm at the definition.
 */
void readWakeFilterFile(long *values, double **freq, double **real, double **imag, 
                        char *freqName, char *realName, char *imagName,
                        char *filename);

long track_through_csbendCSR(double **part, long n_part, CSRCSBEND *csbend, double p_error, 
                             double Po, double **accepted, double z_start, double z_end,
                             CHARGE *charge, char *rootname, MAXAMP *maxamp, APCONTOUR *apContour,
                             APERTURE_DATA *apFileData)
{
  double h, n, he1, he2;
  static long csrWarning = 0;
  static double *beta0=NULL, *ctHist=NULL, *ctHistDeriv=NULL;
  static double *dGamma=NULL, *T1=NULL, *T2=NULL, *denom=NULL, *chik=NULL, *grnk=NULL;
  static long maxParticles = 0, maxBins = 0 ;
  char particleLost;
  double x=0, xp, y=0, yp, p1, beta1, p0;
  double ctLower, ctUpper, dct, slippageLength, phiBend, slippageLength13;
  long diSlippage, diSlippage4;
  long nBins, nBinned = 0;
  long i_part, i_top, kick, j;
  double rho=0.0, Fx, Fy;
  double fse, dp_prime;
  double tilt, etilt, cos_ttilt, sin_ttilt, ttilt;
  double *coord;
  double angle, e1, e2, Kg;
  double psi1, psi2;
  double Qi[6], Qf[6];
  double dcoord_etilt[6];
  double dxi, dyi, dzi;
  double dxf, dyf, dzf;
  double delta_xp;
  double macroParticleCharge, CSRConstant, gamma2, gamma3;
  long iBin, iBinBehind;
  long csrInhibit = 0, largeRhoWarning = 0;
  double derbenevRatio = 0;
  long n_partMoreThanOne = 0;
  TRACKING_CONTEXT tContext;
  VMATRIX *Msection=NULL, *Me1=NULL, *Me2=NULL;
  static double accumulatedAngle = 0;
  short accumulatingAngle = 1;
  double dz_lost=0;
  MULT_APERTURE_DATA apertureData;
#if USE_MPI
  double *buffer;  
#endif
#ifdef DEBUG_IGF
  FILE *fpdeb;
  fpdeb = fopen("csr.sdds","w");
  fprintf(fpdeb, "SDDS1\n&parameter name = Slice, type=long &end\n");
  fprintf(fpdeb, "&column name=s, type=double, units=m &end\n");
  fprintf(fpdeb, "&column name=iBin, type=long &end\n");
  fprintf(fpdeb, "&column name=Chi, type=double &end\n");
  fprintf(fpdeb, "&column name=G, units=V/m, type=double &end\n");
  fprintf(fpdeb, "&column name=dGamma, type=double &end\n");
  fprintf(fpdeb, "&data mode=ascii &end\n");
#endif

#ifdef HAVE_GPU
  if(getElementOnGpu()){
    startGpuTimer();
    i_part = gpu_track_through_csbendCSR(n_part, csbend, p_error, Po, accepted, z_start,  z_end, charge, rootname, maxamp, apContour, apFileData);
#ifdef GPU_VERIFY     
    startCpuTimer();
    /* Copy the csrWake global struct (it is reset below) */
    CSR_LAST_WAKE gpuCsrWake;
    memcpy(&gpuCsrWake, &csrWake, sizeof(CSR_LAST_WAKE));
    csrWake.FdNorm = NULL; /* Reset doesn't deallocate */
    csrWake.StupakovFileActive = 0; /* Reset doesn't close */

    track_through_csbendCSR(part, n_part, csbend, p_error, Po, accepted, z_start, z_end, charge, rootname, maxamp, apContour, apFileData);
    compareGpuCpu(n_part, "track_through_csbendCSR");

    /* compare CSR_LAST_WAKE structs */
    compareCSR_LAST_WAKE(&gpuCsrWake, &csrWake);
    /* Deallocate gpuCsrWake */
    if (gpuCsrWake.FdNorm) {
      free(gpuCsrWake.FdNorm);
      free(gpuCsrWake.xSaldin);
    }
    if (gpuCsrWake.StupakovFileActive)
      if (!SDDS_Terminate(&gpuCsrWake.SDDS_Stupakov))
        bombElegant("problem terminating data file for Stupakov output from CSRDRIFT", NULL);
#endif /* GPU_VERIFY */
    return i_part;
  }
#endif /* HAVE_GPU */
 
  gamma2 = Po*Po+1;
  gamma3 = pow(gamma2, 3./2);

#if USE_MPI 
  if (notSinglePart)
    n_partMoreThanOne = 1; /* This is necessary to solve synchronization issue in parallel version*/
  else
    if (n_part > 1) n_partMoreThanOne = 1;	
#else
  if (n_part > 1) n_partMoreThanOne = 1;
#endif

  if (!(csbend->edgeFlags&SAME_BEND_PRECEDES))
    accumulatedAngle = accumulatingAngle = 0;
  
  csrWake.valid = 0;
  refTrajectoryMode = 0;
  if (isSlave || !notSinglePart) 
    reset_driftCSR();

  getTrackingContext(&tContext);
  
  if (!csbend)
    bombElegant("null CSRCSBEND pointer (track_through_csbend)", NULL);
  if (csbend->integratedGreensFunction && !csbend->steadyState) 
    bombElegant("CSRCSBEND requires STEADYSTATE=1 if IGF=1.", NULL);
  if (csbend->edge_order>1 && (csbend->edge_effects[csbend->e1Index]==2 || csbend->edge_effects[csbend->e2Index]==2) && csbend->hgap==0)
    bombElegant("CSRCSBEND has EDGE_ORDER>1 and EDGE[12]_EFFECTS==2, but HGAP=0. This gives undefined results.", NULL);

  if (csbend->angle==0) {
    if (!csbend->useMatrix)
      exactDrift(part, n_part, csbend->length); 
    else {
      long i;
      if (isSlave || !notSinglePart) {
        for (i=0; i<n_part; i++) {
          part[i][0] += csbend->length*part[i][1];
          part[i][2] += csbend->length*part[i][3];
          part[i][4] += csbend->length;
        }
      }
    }
    return n_part;
  }

  if (csbend->integration_order!=2 && csbend->integration_order!=4  && csbend->integration_order!=6)
    bombElegant("CSBEND integration_order is invalid--must be either 2, 4, or 6", NULL);

  macroParticleCharge = 0;
  if (charge) {
    macroParticleCharge = charge->macroParticleCharge;
  } else if (csbend->bins && !csrWarning && csbend->csr) {
    printf("Warning: you asked for CSR on CSBEND but didn't give a CHARGE element\n");
    fflush(stdout);
    csrWarning = 1;
  }
  
  if ((nBins=csbend->bins)<2)
    bombElegant("Less than 2 bins for CSR!", NULL);

  if (csbend->SGDerivHalfWidth<=0)
    csbend->SGDerivHalfWidth = csbend->SGHalfWidth;
  if (csbend->SGDerivHalfWidth<=0)
    csbend->SGDerivHalfWidth = 1;

  if (csbend->SGDerivOrder<=0)
    csbend->SGDerivOrder = csbend->SGOrder;
  if (csbend->SGDerivOrder<=0)
    csbend->SGDerivOrder = 1;
  
  if (isSlave || !notSinglePart) 
    if (n_part>maxParticles &&
	(!(beta0=SDDS_Realloc(beta0, sizeof(*beta0)*(maxParticles=n_part)))))
      bombElegant("Memory allocation failure (track_through_csbendCSR)", NULL);

  rho0 = csbend->length/csbend->angle;
  if (csbend->use_bn) {
    csbend->b[0] = 0;
    csbend->b[1] = csbend->b1;
    csbend->b[2] = csbend->b2;
    csbend->b[3] = csbend->b3;
    csbend->b[4] = csbend->b4;
    csbend->b[5] = csbend->b5;
    csbend->b[6] = csbend->b6;
    csbend->b[7] = csbend->b7;
    csbend->b[8] = csbend->b8;
  } else {
    csbend->b[0] = 0;
    csbend->b[1] = csbend->k1*rho0;
    csbend->b[2] = csbend->k2*rho0;
    csbend->b[3] = csbend->k3*rho0;
    csbend->b[4] = csbend->k4*rho0;
    csbend->b[5] = csbend->k5*rho0;
    csbend->b[6] = csbend->k6*rho0;
    csbend->b[7] = csbend->k7*rho0;
    csbend->b[8] = csbend->k8*rho0;
  }
  for (j=0; j<9; j++)
    csbend->c[j] = 0;

  he1 = csbend->h[csbend->e1Index];
  he2 = csbend->h[csbend->e2Index];
  if (csbend->angle<0) {
    long i;
    angle = -csbend->angle;
    e1    = -csbend->e[csbend->e1Index];
    e2    = -csbend->e[csbend->e2Index];
    etilt = csbend->etilt*csbend->etiltSign;
    tilt  = csbend->tilt + PI;
    rho0  = csbend->length/angle;
    for (i=1; i<9; i+=2) {
      csbend->b[i] *= -1;
      csbend->c[i] *= -1;
    }
  }
  else {
    angle = csbend->angle;
    e1    = csbend->e[csbend->e1Index];
    e2    = csbend->e[csbend->e2Index];
    etilt = csbend->etilt*csbend->etiltSign;
    tilt  = csbend->tilt;
    rho0  = csbend->length/angle;
  }

  setupMultApertureData(&apertureData, -tilt, apContour, maxamp, apFileData, z_start+csbend->length/2);

  if (rho0>1e6) {
    if (!largeRhoWarning) {
      printf("Warning: One or more CSRCSBENDs have radius > 1e6.  Treated as drift.\n");
      largeRhoWarning = 1;
    }
    exactDrift(part, n_part, csbend->length);
    return n_part;
  }
  
  h = 1/rho0;
  n = -csbend->b[1]/h;
  fse = csbend->fse;
  if (fse>-1)
    rho_actual = 1/((1+fse)*h);
  else
    rho_actual = 1e16/h;

  /* angles for fringe-field effects */
  Kg   = 2*csbend->hgap*csbend->fint;
  psi1 = Kg/rho_actual/cos(e1)*(1+sqr(sin(e1)));
  psi2 = Kg/rho_actual/cos(e2)*(1+sqr(sin(e2)));
  if (csbend->length<0) {
    psi1 *=  -1;
    psi2 *=  -1;
  }

  /* rad_coef is d((P-Po)/Po)/ds for the on-axis, on-momentum particle, where po is the momentum of
   * the central particle.
   */
  if (csbend->synch_rad)
    rad_coef = sqr(particleCharge)*pow3(Po)*sqr(1+fse)/(6*PI*epsilon_o*sqr(c_mks)*particleMass*sqr(rho0));
  else
    rad_coef = 0;
  /* isrConstant is the RMS increase in dP/P per meter due to incoherent SR.  */
  if (csbend->isr && (n_part>1 || !csbend->isr1Particle)) 
    isrConstant = particleRadius*sqrt(55.0/(24*sqrt(3))*pow5(Po)*
                              137.0359895/pow3(fabs(rho_actual)));
  else
    isrConstant = 0;

  distributionBasedRadiation = 0;
  
  if (csbend->useMatrix) {
    csbend->nonlinear = 0;
    Me1 = edge_matrix(e1, 1./(rho0/(1+csbend->fse)), 0.0, n, -1, Kg, 1, 0, 0, csbend->length);
    Msection = bend_matrix(csbend->length/csbend->nSlices, 
                           angle/csbend->nSlices, 0.0, 0.0, 
                           0.0, 0.0, csbend->b[1]*h,  0.0,
                           0.0, 0.0, 0.0, 0.0, csbend->fse, 0.0, 0.0, 
                           csbend->etilt*csbend->etiltSign, 1, 1, 0, 0, 0.0, 0.0);
    Me2 = edge_matrix(e2, 1./(rho0/(1+csbend->fse)), 0.0, n, 1, Kg, 1, 0, 0, csbend->length);
  }
  computeCSBENDFieldCoefficients(csbend->b, csbend->c, h, csbend->nonlinear, csbend->expansionOrder);

  ttilt = tilt + etilt;
  if (ttilt==0) {
    cos_ttilt = 1;
    sin_ttilt = 0;
  }
  else if (fabs(fabs(ttilt)-PI)<1e-12) {
    cos_ttilt = -1;
    sin_ttilt = 0;
  }
  else if (fabs(ttilt-PIo2)<1e-12) {
    cos_ttilt = 0;
    sin_ttilt = 1;
  }
  else if (fabs(ttilt+PIo2)<1e-12) {
    cos_ttilt = 0;
    sin_ttilt = -1;
  }
  else {
    cos_ttilt = cos(ttilt);
    sin_ttilt = sin(ttilt);
  }


  if (etilt)
    computeEtiltCentroidOffset(dcoord_etilt, rho0, angle, etilt, tilt);
  else
    fill_double_array(dcoord_etilt, 6L, 0.0);

  dxi = -csbend->dx;
  dzi =  csbend->dz;
  dyi = -csbend->dy;

  /* must use the original angle here because the translation is done after
   * the final rotation back
   */
  dxf =  csbend->dx*cos(csbend->angle) + csbend->dz*sin(csbend->angle);
  dzf =  csbend->dx*sin(csbend->angle) - csbend->dz*cos(csbend->angle);
  dyf = csbend->dy;

  if (isMaster) {
  if (csbend->particleOutputFile && strlen(csbend->particleOutputFile) && !csbend->particleFileActive) {
    /* set up SDDS output file for particle coordinates inside bend */
    csbend->particleFileActive = 1;
    csbend->particleOutputFile = compose_filename(csbend->particleOutputFile, rootname);
    csbend->SDDSpart = tmalloc(sizeof(*(csbend->SDDSpart)));
    if (!SDDS_InitializeOutputElegant(csbend->SDDSpart, SDDS_BINARY, 1, 
                               NULL, NULL, csbend->particleOutputFile) ||
        0>SDDS_DefineParameter(csbend->SDDSpart, "SVNVersion", NULL, NULL, "SVN version number", NULL, SDDS_STRING, SVN_VERSION) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSpart, "Pass", NULL, SDDS_LONG) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSpart, "Kick", NULL, SDDS_LONG) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSpart, "pCentral", "m$be$nc", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSpart, "Angle", NULL, SDDS_DOUBLE) ||
        (csbend->xIndex=SDDS_DefineColumn(csbend->SDDSpart, "x", NULL, "m", 
                                          NULL, NULL, SDDS_DOUBLE, 0 ))<0 ||
        (csbend->xpIndex=SDDS_DefineColumn(csbend->SDDSpart, "xp", NULL, NULL, 
                                           NULL, NULL, SDDS_DOUBLE, 0))<0 ||
        (csbend->tIndex=SDDS_DefineColumn(csbend->SDDSpart, "t", NULL, "s", 
                                          NULL, NULL, SDDS_DOUBLE, 0))<0 ||
        (csbend->pIndex=SDDS_DefineColumn(csbend->SDDSpart, "p", NULL, "m$be$nc", 
                                          NULL, NULL, SDDS_DOUBLE, 0))<0 ||
        !SDDS_WriteLayout(csbend->SDDSpart)) {
      SDDS_SetError("Problem setting up particle output file for CSR");
      SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
    }
  }
  }
  
  if (isMaster) { 
  if (csbend->histogramFile && strlen(csbend->histogramFile) && !csbend->wakeFileActive) {
    /* set up SDDS output file for CSR monitoring */
    csbend->wakeFileActive = 1;
    csbend->histogramFile = compose_filename(csbend->histogramFile, rootname);
    csbend->SDDSout = tmalloc(sizeof(*(csbend->SDDSout)));
    if (!SDDS_InitializeOutputElegant(csbend->SDDSout, SDDS_BINARY, 1, NULL, NULL, csbend->histogramFile) ||
        0>SDDS_DefineParameter(csbend->SDDSout, "SVNVersion", NULL, NULL, "SVN version number", NULL, SDDS_STRING, SVN_VERSION) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "Pass", NULL, SDDS_LONG) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "Kick", NULL, SDDS_LONG) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "pCentral", "m$be$nc", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "Angle", NULL, SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "SlippageLength", "m", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "TotalBunchLength", "m", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "BinSize", "m", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "dsKick", "m", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleParameter(csbend->SDDSout, "DerbenevRatio", NULL, SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "s", "m", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "LinearDensity", "C/s", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "LinearDensityDeriv", "C/s$a2$n", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "DeltaGamma", NULL, SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "GammaDeriv", "1/m", SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "DeltaGammaT1", NULL, SDDS_DOUBLE) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSout, "DeltaGammaT2", NULL, SDDS_DOUBLE) ||
        !SDDS_WriteLayout(csbend->SDDSout)) {
      SDDS_SetError("Problem setting up wake output file for CSR");
      SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
    }
  }
  }
  if (csbend->wakeFilterFile && strlen(csbend->wakeFilterFile) && !csbend->wffValues) 
    readWakeFilterFile(&csbend->wffValues,
                       &csbend->wffFreqValue, &csbend->wffRealFactor, &csbend->wffImagFactor, 
                       csbend->wffFreqColumn, csbend->wffRealColumn, csbend->wffImagColumn,
                       csbend->wakeFilterFile);
  
  /*  prepare arrays for CSR integrals */
  nBins = csbend->bins;
  if (!(ctHist=SDDS_Realloc(ctHist, sizeof(*ctHist)*nBins)) ||
      !(ctHistDeriv=SDDS_Realloc(ctHistDeriv, sizeof(*ctHistDeriv)*nBins)) ||
      !(denom=SDDS_Realloc(denom, sizeof(*denom)*nBins)) ||
      !(T1=SDDS_Realloc(T1, sizeof(*T1)*nBins)) ||
      !(T2=SDDS_Realloc(T2, sizeof(*T2)*nBins)) ||
      !(dGamma=SDDS_Realloc(dGamma, sizeof(*dGamma)*nBins)))
    bombElegant("memory allocation failure (track_through_csbendCSR)", NULL);

  
  
  /* prepare some data for CSRDRIFT */
  csrWake.dGamma = dGamma;
  csrWake.bins = nBins;
  csrWake.ds0 = csbend->length/csbend->nSlices;
  csrWake.zLast = csrWake.z0 = z_end;
  csrWake.highFrequencyCutoff0 = csbend->highFrequencyCutoff0;
  csrWake.highFrequencyCutoff1 = csbend->highFrequencyCutoff1;
  csrWake.lowFrequencyCutoff0 = csbend->lowFrequencyCutoff0;
  csrWake.lowFrequencyCutoff1 = csbend->lowFrequencyCutoff1;
  csrWake.clipNegativeBins = csbend->clipNegativeBins;
  csrWake.wffValues = csbend->wffValues;
  csrWake.wffFreqValue = csbend->wffFreqValue;
  csrWake.wffRealFactor = csbend->wffRealFactor;
  csrWake.wffImagFactor = csbend->wffImagFactor;
  
#if !defined(PARALLEL)  
  multipoleKicksDone += n_part*csbend->nSlices*(csbend->integration_order==4?4:1);
#endif

  if (isSlave || !notSinglePart) {
  /* check particle data, transform coordinates, and handle edge effects */
  for (i_part=0; i_part<n_part; i_part++) {
    if (!part) {
      printf("error: null particle array found (working on particle %ld) (track_through_csbend)\n", i_part);
      fflush(stdout);
      abort();
    }
    if (!(coord = part[i_part])) {
      printf("error: null coordinate pointer for particle %ld (track_through_csbend)\n", i_part);
      fflush(stdout);
      abort();
    }
    if (accepted && !accepted[i_part]) {
      printf("error: null accepted particle pointer for particle %ld (track_through_csbend)\n", i_part);
      fflush(stdout);
      abort();
    }

    /* adjust for element offsets */
    coord[4] += dzi*sqrt(1 + sqr(coord[1]) + sqr(coord[3]));
    coord[0]  = coord[0] + dxi + dzi*coord[1];
    coord[2]  = coord[2] + dyi + dzi*coord[3];

    /* perform tilt transformations and save some data */
    x  =  coord[0]*cos_ttilt + coord[2]*sin_ttilt;
    y  = -coord[0]*sin_ttilt + coord[2]*cos_ttilt;
    coord[0] = x;
    coord[2] = y;
    xp =  coord[1]*cos_ttilt + coord[3]*sin_ttilt;
    yp = -coord[1]*sin_ttilt + coord[3]*cos_ttilt;
    coord[1] = xp;
    coord[3] = yp;
    p0 = Po*(1+coord[5]);
    beta0[i_part] = p0/sqrt(p0*p0+1);
    coord[4] /= beta0[i_part];

#undef X
#undef Y
#define X coord[0]
#define Y coord[2]
#define XP coord[1]
#define YP coord[3]
#define CT coord[4]
#define DP coord[5]
    if (csbend->edgeFlags&BEND_EDGE1_EFFECTS) {
      /* apply edge focusing */
      if (csbend->useMatrix)
        track_particles(&coord, Me1, &coord, 1);
      else {
        rho = (1+DP)*rho_actual;
        if (csbend->edge_order<=1 && csbend->edge_effects[csbend->e1Index]==1) {
          /* apply edge focusing, nonsymplectic method */
          delta_xp = tan(e1)/rho*X;
          XP += delta_xp;
          YP -= tan(e1-psi1/(1+DP))/rho*Y;
        } else if (csbend->edge_order>=2 && csbend->edge_effects[csbend->e1Index]==1) 
          apply_edge_effects(&X, &XP, &Y, &YP, rho, n, e1, he1, psi1*(1+DP), -1);
        else if (csbend->edge_effects[csbend->e1Index]==2) {
          rho = (1+DP)*rho_actual;
	  /* load input coordinates into arrays */
	  Qi[0] = X;
	  Qi[1] = XP;
	  Qi[2] = Y;
	  Qi[3] = YP;
	  Qi[4] = 0;  
	  Qi[5] = DP;
          convertToDipoleCanonicalCoordinates(Qi, 0);
          dipoleFringeKHwang(Qf, Qi, rho_actual, -1., csbend->edge_order, csbend->b[1]/rho0, e1, 2*csbend->hgap, csbend->fint, csbend->h[csbend->e1Index]);
	  /* retrieve coordinates from arrays */
          convertFromDipoleCanonicalCoordinates(Qf, 0);
	  X  = Qf[0];  
	  XP = Qf[1];  
	  Y  = Qf[2];  
	  YP = Qf[3];  
	  DP = Qf[5];
        } else if (csbend->edge_effects[csbend->e1Index]==3) {
          applySimpleDipoleEdgeKick(&XP, &YP, X, Y, DP, rho_actual, e1, psi1, -1.0, 0);
        }
      }
    }

    if (csbend->edgeFlags&BEND_EDGE1_EFFECTS && e1!=0 && rad_coef) {
      /* pre-adjust dp/p to anticipate error made by integrating over entire sector */
      computeCSBENDFields(&Fx, &Fy, X, Y);

      dp_prime = -rad_coef*(sqr(Fx)+sqr(Fy))*sqr(1+DP)*
        sqrt(sqr(1+X/rho0)+sqr(XP)+sqr(YP));
      DP -= dp_prime*X*tan(e1);
    }
  }
  }
  if (csbend->csr && n_partMoreThanOne)
    CSRConstant = 2*macroParticleCharge*particleCharge/pow(3*rho0*rho0, 1./3.)/(4*PI*epsilon_o*particleMass*sqr(c_mks));
  else
    CSRConstant = 0;
  /* Now do the body of the sector dipole */
  phiBend = accumulatedAngle; 
  i_top = n_part - 1;
  for (kick=0; kick<(csbend->nSlices+1); kick++) {
    if (!csbend->backtrack && kick==csbend->nSlices)
      break;
    if (isSlave || !notSinglePart) {
      if (!csbend->backtrack || kick!=0) {
        for (i_part=0; i_part<=i_top; i_part++) {
          coord = part[i_part];
          
          if (csbend->useMatrix) {
            track_particles(&coord, Msection, &coord, 1);
          } else {
            /* load input coordinates into arrays */
            Qi[0] = X;
            Qi[1] = XP;
            Qi[2] = Y;
            Qi[3] = YP;
            Qi[4] = 0;  
            Qi[5] = DP;
            convertToDipoleCanonicalCoordinates(Qi, 0);
            
            particleLost = !integrate_csbend_ordn(Qf, Qi, NULL, csbend->length/csbend->nSlices, 1, -1, rho0, Po, &dz_lost, &apertureData, csbend->integration_order);

            /* retrieve coordinates from arrays */
            convertFromDipoleCanonicalCoordinates(Qf, 0);
            X  = Qf[0];  
            XP = Qf[1];  
            Y  = Qf[2];  
            YP = Qf[3];  
            DP = Qf[5];
            
            if (particleLost) {
              if (!part[i_top]) {
                printf("error: couldn't swap particles %ld and %ld--latter is null pointer (track_through_csbend)\n",
                       i_part, i_top);
                fflush(stdout);
                abort();
              }
              memcpy(part[i_part],  Qf, sizeof(part[i_part][0])*6);
              convertFromCSBendCoords(part+i_part, 1, rho0, cos_ttilt, sin_ttilt, 0);
              swapParticles(part[i_part], part[i_top]);
              if (accepted) {
                if (!accepted[i_top]) {
                  printf(
                         "error: couldn't swap acceptance data for particles %ld and %ld--latter is null pointer (track_through_csbend)\n",
                         i_part, i_top);
                  fflush(stdout);
                  abort();
                }
                swapParticles(accepted[i_part], accepted[i_top]);
              }
              part[i_top][4] = z_start + dz_lost;
              part[i_top][5] = Po*(1+part[i_top][5]);
              
              i_top--;
              i_part--;
            } else {
              if (rad_coef || isrConstant) {
                /* convert additional distance traveled to ct using mean velocity */
                p1 = Po*(1+DP);
                beta1 = p1/sqrt(p1*p1+1);
                CT += Qf[4]*2/(beta0[i_part]+beta1);
                beta0[i_part] = beta1;
              } else
                CT += Qf[4]/beta0[i_part];  
            }
          }
        }
        n_part = i_top + 1;
      }
    }

    if (csbend->backtrack && kick==csbend->nSlices)
      break;

    if (n_partMoreThanOne && csbend->derbenevCriterionMode) {
      /* evaluate Derbenev criterion from TESLA-FEL 1995-05: sigma_x/sigma_z << (R/sigma_z)^(1/3) */
      long code;
      double Sz, Sx;
      switch (code=match_string(csbend->derbenevCriterionMode, derbenevCriterionOption, N_DERBENEV_CRITERION_OPTIONS, 0)) {
      case DERBENEV_CRITERION_DISABLE:
	break;
      case DERBENEV_CRITERION_EVAL:
      case DERBENEV_CRITERION_ENFORCE:
#if !USE_MPI
	rms_emittance(part, 4, 5, n_part, &Sz, NULL, NULL, NULL, NULL);
	rms_emittance(part, 0, 1, n_part, &Sx, NULL, NULL, NULL, NULL);
#else
     if (notSinglePart) {
        /* The master will get the result from the rms_emittance routine */
       rms_emittance_p(part, 4, 5, n_part, &Sz, NULL, NULL, NULL, NULL, NULL);
       rms_emittance_p(part, 0, 1, n_part, &Sx, NULL, NULL, NULL, NULL, NULL);
     } else {
       rms_emittance(part, 4, 5, n_part, &Sz, NULL, NULL, NULL, NULL);
       rms_emittance(part, 0, 1, n_part, &Sx, NULL, NULL, NULL, NULL);
     }
#endif
	Sz = sqrt(Sz);
	Sx = sqrt(Sx);
	derbenevRatio = (Sx/Sz)/pow(rho0/Sz, 1./3.);
	if (derbenevRatio>0.1) {
	  if (code==DERBENEV_CRITERION_EVAL)
	    fprintf(stderr, "Warning: Using 1-D CSR formalism but Derbenev criterion not satisfied (%le > 0.1).\n",
		    derbenevRatio);
	  else {
	    csrInhibit = 1;
	    fprintf(stderr, "Warning: Derbenev criterion not satisfied (%le > 0.1)---not applying CSR\n",
		    derbenevRatio);
	  }
	}
	break;
      default:
	fprintf(stderr, "Error: invalid value for DERBENEV_CRITERION_MODE. Give 'disable', 'evaluate', or 'enforce'\n");
	exit(1);
	break;
      }
    }
    

#if (!USE_MPI)
    if (n_partMoreThanOne && !csrInhibit) {
#else
      if (!csrInhibit && (notSinglePart || (!notSinglePart && n_partMoreThanOne))) { /* n_part could be 0 for some processors, which could cause synchronization problem */
#endif
      /* compute CSR potential function */
      if (kick==0 || !csbend->binOnce) {
        /* - first make a density histogram */
        ctLower = ctUpper = dct = 0;
	nBinned = binParticleCoordinate(&ctHist, &maxBins,
                                   &ctLower, &ctUpper, &dct, &nBins, 
                                   csbend->binRangeFactor<1.1?1.1:csbend->binRangeFactor, 
					part, n_part, 4);
#if (!USE_MPI) 
	if (nBinned != n_part) {
          printf("Only %ld of %ld particles binned for CSRCSBEND (z0=%le, kick=%ld, BRF=%le)\n", 
		  nBinned, n_part, z_start, kick, csbend->binRangeFactor<1.1?1.1:csbend->binRangeFactor);
	  printf("ct min, max = %21.15e, %21.15e, dct = %21.15e, nBins=%ld, maxBins=%ld\n",
		  ctLower, ctUpper, dct, nBins, maxBins);
          fflush(stdout);
        }
#else
     if (notSinglePart) {
	if (USE_MPI) {
	  long all_binned, result = 1, nBinned_total;

          if (isSlave || !notSinglePart) {
	    result = ((nBinned==n_part) ? 1 : 0);
	  }
	  MPI_Allreduce(&result, &all_binned, 1, MPI_LONG, MPI_LAND, MPI_COMM_WORLD);
	  MPI_Allreduce(&nBinned, &nBinned_total, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD);
          nBinned = nBinned_total; 
	  if (!all_binned && isMaster) {
	    printf("Not all particles binned for CSRCSBEND (z0=%le, kick=%ld, BRF=%le)\n", 
		    z_start, kick,
		  csbend->binRangeFactor<1.1?1.1:csbend->binRangeFactor);
	  printf("ct min, max = %21.15e, %21.15e, dct = %21.15e, nBins=%ld, maxBins=%ld\n",
		  ctLower, ctUpper, dct, nBins, maxBins);
          fflush(stdout);
        }
        }

	if (USE_MPI) {  /* Master needs to know the information to write the result */
	  buffer = malloc(sizeof(double) * nBins);
	  MPI_Allreduce(ctHist, buffer, nBins, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	  memcpy(ctHist, buffer, sizeof(double)*nBins);
	  free(buffer);
	}
     }
#endif
        
        /* - smooth the histogram, normalize to get linear density, and 
           copy in preparation for taking derivative
           */
        if (csbend->highFrequencyCutoff0>0 || csbend->lowFrequencyCutoff0>=0) {
          long nz;
          nz = applyLHPassFilters(ctHist, nBins, 
                                  csbend->lowFrequencyCutoff0, csbend->lowFrequencyCutoff1,
                                  csbend->highFrequencyCutoff0, csbend->highFrequencyCutoff1,
                                  csbend->clipNegativeBins);
          if (nz && negativeWarningsLeft) {
	    printf("Warning: low pass filter resulted in negative values in %ld bins\n", nz);
            if (--negativeWarningsLeft==0)
              printf("         Further warnings will be suppressed for this run.\n");
            fflush(stdout);
          }
        }
        if (csbend->SGHalfWidth>0) {
          SavitzyGolaySmooth(ctHist, nBins, csbend->SGOrder, csbend->SGHalfWidth, csbend->SGHalfWidth,  0);
          correctDistribution(ctHist, nBins, 1.0*nBinned);
        }
        for (iBin=0; iBin<nBins; iBin++) {
          denom[iBin] = pow(dct*iBin, 1./3.);
          ctHistDeriv[iBin] = (ctHist[iBin] /= dct);
        }
        /* - compute derivative with smoothing.  The deriv is w.r.t. index number and
         * I won't scale it now as it will just fall out in the integral 
         */
        SavitzyGolaySmooth(ctHistDeriv, nBins, csbend->SGDerivOrder, 
                           csbend->SGDerivHalfWidth, csbend->SGDerivHalfWidth, 1);
      } else {
        ctLower += rho0*angle/csbend->nSlices;
        ctUpper += rho0*angle/csbend->nSlices;
      }
      
      
      phiBend += angle/csbend->nSlices;
      slippageLength = fabs(rho0*ipow(phiBend, 3)/24.0);
      slippageLength13 = pow(slippageLength, 1./3.);
      diSlippage = slippageLength/dct;
      diSlippage4 = 4*slippageLength/dct;
      if (kick==0 || !csbend->binOnce) {
        if (csbend->integratedGreensFunction) {
          /* Integrated Greens function method */
          double const2;
          double z, xmu, a, b, frac, const1;
          if (kick==0) {
            if (!csbend->steadyState) 
              bombElegant("Must have STEADY_STATE=1 when IGF=1\n", NULL);
            if (!(grnk=SDDS_Realloc(grnk, sizeof(*grnk)*nBins)) ||
                !(chik=SDDS_Realloc(chik, sizeof(*chik)*nBins)))
              bombElegant("memory allocation failure (track_through_csbendCSR)", NULL);
          }
          frac = 9.0/16.0;
          const1 = 6.0-log(27.0/4.0);
          for (iBin=0; iBin<nBins; iBin++) {
            z   = iBin*dct;
            xmu = 3.0*gamma3*z/(2.0*rho0);
            a   = sqrt(xmu*xmu+1.0);
            b   = a+xmu;
            if (xmu < 1e-3) 
              chik[iBin] = frac*const1 + 0.50*ipow(xmu,2)-(7.0/54.0)*ipow(xmu,4)+(140.0/2187.0)*ipow(xmu,6);
            else
              chik[iBin] = frac*( 3.0*( -2.0*xmu*pow(b,1.0/3.0) + pow(b,2.0/3.0) + pow(b,4.0/3.0) ) +
                                 log( pow((1-pow(b,2.0/3.0))/xmu,2)  / (1+pow(b,2.0/3.0)+pow(b,4.0/3.0)) ) );
          }
          const2 = (16.0/27.0)*(particleCharge/(4*PI*epsilon_o))/(gamma2*dct);
          grnk[0] = const2*(chik[1]-chik[0]);
          for (iBin=1; iBin<nBins-1; iBin++)
            grnk[iBin] = const2*(chik[iBin+1] - 2.0*chik[iBin] + chik[iBin-1] );
          grnk[nBins-1] = 0;
        } else {
          for (iBin=0; iBin<nBins; iBin++) {
            double term1, term2;
            long count;
            T1[iBin] = T2[iBin] = 0;
            term1 = term2 = 0;
            if (CSRConstant) {
              if (csbend->steadyState) {
                if (!csbend->integratedGreensFunction) {
                  if (!csbend->trapazoidIntegration) {
                    for (iBinBehind=iBin+1; iBinBehind<nBins; iBinBehind++)
                      T1[iBin] += ctHistDeriv[iBinBehind]/denom[iBinBehind-iBin];
                  }
                  else {
                    if ((iBinBehind=iBin+1)<nBins)
                      term1 = ctHistDeriv[iBinBehind]/denom[iBinBehind-iBin];
                    for (count=0, iBinBehind=iBin+1; iBinBehind<nBins; iBinBehind++, count++)
                      T1[iBin] += (term2=ctHistDeriv[iBinBehind]/denom[iBinBehind-iBin]);
                    if ((iBin+1)<nBins)
                      T1[iBin] += 0.3*sqr(denom[1])*(2*ctHistDeriv[iBin+1]+3*ctHistDeriv[iBin])/dct;
                    if (count>1)
                      T1[iBin] -= (term1+term2)/2;
                  }
                }
              } else {
                /* Transient CSR */
                if (!csbend->trapazoidIntegration) {
                  for (iBinBehind=iBin+1; iBinBehind<=(iBin+diSlippage) && iBinBehind<nBins; iBinBehind++)
                    T1[iBin] += ctHistDeriv[iBinBehind]/denom[iBinBehind-iBin];
                }
                else {
                  if ((iBinBehind = iBin+1)<nBins && iBinBehind<=(iBin+diSlippage))
                    term1 = ctHistDeriv[iBinBehind]/denom[iBinBehind-iBin]/2;
                  for (count=0, iBinBehind = iBin+1; iBinBehind<=(iBin+diSlippage) && iBinBehind<nBins; 
                       count++, iBinBehind++)
                    T1[iBin] += (term2=ctHistDeriv[iBinBehind]/denom[iBinBehind-iBin]);
                  if (diSlippage>0 && (iBin+1)<nBins)
                    T1[iBin] += 0.3*sqr(denom[1])*(2*ctHistDeriv[iBin+1]+3*ctHistDeriv[iBin])/dct;
                  if (count>1)
                    T1[iBin] -= (term1+term2)/2;
                }
                if ((iBin+diSlippage)<nBins)
                  T2[iBin] += ctHist[iBin+diSlippage];
                if ((iBin+diSlippage4)<nBins)
                  T2[iBin] -= ctHist[iBin+diSlippage4];
              }
              /* there is no negative sign here because my derivative is w.r.t. -s
                 in the notation of Saldin et al. */
              T1[iBin] *= CSRConstant*csbend->length/csbend->nSlices; 
              /* keep the negative sign on this term, which has no derivative */
              T2[iBin] *= -CSRConstant*csbend->length/csbend->nSlices/slippageLength13;
            }
            dGamma[iBin] = T1[iBin]+T2[iBin];
          }
        }
        
	if (csbend->integratedGreensFunction) {
          convolveArrays1(dGamma, nBins, ctHist, grnk);
          for (iBin=0; iBin<nBins; iBin++) 
            dGamma[iBin] *= -macroParticleCharge/(particleMass*sqr(c_mks))*csbend->length/csbend->nSlices;
#ifdef DEBUG_IGF
	  fprintf(fpdeb, "%ld\n%ld\n", kick, nBins);
	  for (iBin=0; iBin<nBins; iBin++)
	    fprintf(fpdeb, "%le %ld %le %le %le\n", iBin*dct, iBin, chik[iBin], grnk[iBin], dGamma[iBin]);
#endif
	}

	if (csbend->wffValues) 
	  applyFilterTable(dGamma, nBins, dct/c_mks, csbend->wffValues, csbend->wffFreqValue,
			   csbend->wffRealFactor, csbend->wffImagFactor);
      }
      if (isSlave || !notSinglePart) {
	if (CSRConstant) {
	  for (i_part=0; i_part<n_part; i_part++) {
	    long nBins1;
            double f;
	    nBins1 = nBins-1;
	    coord = part[i_part];
            /* apply CSR kick */
            iBin = (f=(CT-ctLower)/dct);
            f -= iBin;
            if (iBin>=0 && iBin<nBins1) {
              DP += ((1-f)*dGamma[iBin]+f*dGamma[iBin+1])/Po*(1+X/rho0); 
              /* This code probably should be uncommented, but makes very little difference.
              p1 = Po*(1+DP);
              beta1 = p1/sqrt(p1*p1+1);
              CT *= beta0[i_part]/beta1;
              beta0[i_part] = beta1;
              */
            }
	  }
	}
      }
  
      if (csbend->particleFileActive && kick%csbend->particleOutputInterval==0) {
	if (isMaster) {
        long ip;
        /* dump particle data at this location */
        if (!SDDS_StartPage(csbend->SDDSpart, n_part) ||
            !SDDS_SetParameters(csbend->SDDSpart, SDDS_SET_BY_NAME|SDDS_PASS_BY_VALUE, 
                                "Pass", -1, "Kick", kick, "pCentral", Po, "Angle", phiBend, 
                                NULL))
          SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
	convertFromCSBendCoords(part, n_part, rho0, cos_ttilt, sin_ttilt, 1);
        for (ip=0; ip<n_part; ip++) {
          if (!SDDS_SetRowValues(csbend->SDDSpart, SDDS_SET_BY_INDEX|SDDS_PASS_BY_VALUE,
                                 ip, 
                                 csbend->xIndex, part[ip][0],
                                 csbend->xpIndex, part[ip][1],
                                 csbend->tIndex, part[ip][4],
                                 csbend->pIndex, Po*(1+part[ip][5]),
                                 -1)) 
            SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
        }
	convertToCSBendCoords(part, n_part, rho0, cos_ttilt, sin_ttilt, 1);
        if (!SDDS_WritePage(csbend->SDDSpart))
          SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
        if (!inhibitFileSync)
          SDDS_DoFSync(csbend->SDDSpart);
	}
      }

      if (tContext.sliceAnalysis && tContext.sliceAnalysis->active &&
	  kick!=(csbend->nSlices-1) &&
	  (csbend->sliceAnalysisInterval==0 ||
	   kick%csbend->sliceAnalysisInterval==0)) {
#if (!USE_MPI)
	convertFromCSBendCoords(part, n_part, rho0, cos_ttilt, sin_ttilt, 1);
	performSliceAnalysisOutput(tContext.sliceAnalysis, part, n_part, 
				   0, tContext.step, Po, 
				   macroParticleCharge*n_part,
				   tContext.elementName, 
				   z_start + (kick*(z_end-z_start))/(csbend->nSlices-1),
				   1);
	convertToCSBendCoords(part, n_part, rho0, cos_ttilt, sin_ttilt, 1);
#else 
      if (isMaster) 
	printf ("Pelegant does not support slice analysis output inside an element now.");
    
#endif
      }

      if (csbend->wakeFileActive && 
          ((!csbend->outputLastWakeOnly && kick%csbend->outputInterval==0) ||
           (csbend->outputLastWakeOnly && kick==(csbend->nSlices-1)))) {
        /* scale the linear density and its derivative to get C/s and C/s^2 
         * ctHist is already normalized to dct, but ctHistDeriv requires an additional factor
         */
        for (iBin=0; iBin<nBins; iBin++) {
          ctHist[iBin] *= macroParticleCharge*c_mks;
          ctHistDeriv[iBin] *= macroParticleCharge*sqr(c_mks)/dct;
        }
 
	if (isMaster) {
        if (!SDDS_StartPage(csbend->SDDSout, nBins) ||
            !SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, dGamma, nBins, "DeltaGamma") ||
            !SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, T1, nBins, "DeltaGammaT1") ||
            !SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, T2, nBins, "DeltaGammaT2") ||
            !SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, ctHist, nBins, "LinearDensity") ||
            !SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, ctHistDeriv, nBins, "LinearDensityDeriv") ||
            !SDDS_SetParameters(csbend->SDDSout, SDDS_SET_BY_NAME|SDDS_PASS_BY_VALUE, 
                                "Pass", -1, "Kick", kick, "dsKick", csbend->length/csbend->nSlices,
                                "pCentral", Po, "Angle", phiBend, "SlippageLength", slippageLength,
                                "TotalBunchLength", ctUpper-ctLower,
                                "BinSize", dct, 
                                "DerbenevRatio", derbenevRatio, NULL))
          SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
	}
        if (csbend->binOnce) {
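          /* undo the output scaling so these arrays can be used again.
           * NOTE(review): iBin equals nBins here (left over from the scaling loop
           * above), so the two statements below touch a single element past the
           * scaled range rather than restoring bins 0..nBins-1; this probably
           * needs to be wrapped in the same for-loop over iBin -- confirm.
           */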
          ctHist[iBin] /= macroParticleCharge*c_mks;
          ctHistDeriv[iBin] /= macroParticleCharge*sqr(c_mks)/dct;
        }
        /* use T1 array to output s and T2 to output dGamma/ds */
        for (iBin=0; iBin<nBins; iBin++) {
          T1[iBin] = ctLower-(ctLower+ctUpper)/2.0+dct*(iBin+0.5);
          T2[iBin] = dGamma[iBin]/(csbend->length/csbend->nSlices);
        }
	if (isMaster){
        if (!SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, T1, nBins, "s") ||
            !SDDS_SetColumn(csbend->SDDSout, SDDS_SET_BY_NAME, T2, nBins, "GammaDeriv") ||
            !SDDS_WritePage(csbend->SDDSout))
          SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
        if (!inhibitFileSync)
          SDDS_DoFSync(csbend->SDDSout);
      }
    }
  }
  }

  if (!csbend->binOnce && n_partMoreThanOne && !csrInhibit && !csbend->csrBlock) {
    /* prepare some data for use by CSRDRIFT element */
    csrWake.dctBin = dct;
    ctLower = ctUpper = dct = 0;

    nBinned =  binParticleCoordinate(&ctHist, &maxBins,
                                     &ctLower, &ctUpper, &dct, &nBins, 
                                     csbend->binRangeFactor<1.1?1.1:csbend->binRangeFactor, 
				     part, n_part, 4);
#if (!USE_MPI)
    if (nBinned!=n_part) {
      printf("Only %ld of %ld particles binned for CSRCSBEND (z0=%le, end, BRF=%le)\n", 
	      nBinned, n_part, z_start, csbend->binRangeFactor<1.1?1.1:csbend->binRangeFactor);
      printf("ct min, max = %21.15e, %21.15e, dct = %21.15e, nBins=%ld, maxBins=%ld\n",
	      ctLower, ctUpper, dct, nBins, maxBins);
      fflush(stdout);
    }
#else
    if (USE_MPI && notSinglePart) {
      long all_binned, result = 1, nBinned_total;

      if (isSlave || !notSinglePart) {
	result = ((nBinned==n_part) ? 1 : 0);
      }
      else
	nBinned = 0;
      MPI_Allreduce(&result, &all_binned, 1, MPI_LONG, MPI_LAND, MPI_COMM_WORLD);
      MPI_Allreduce(&nBinned, &nBinned_total, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD);
      nBinned = nBinned_total; 
      if (!all_binned && isMaster) {
	printf("Not all particles binned for CSRCSBEND (z0=%le, kick=%ld, BRF=%le)\n", 
		z_start, kick,
	      csbend->binRangeFactor<1.1?1.1:csbend->binRangeFactor);
      printf("ct min, max = %21.15e, %21.15e, dct = %21.15e, nBins=%ld, maxBins=%ld\n",
	      ctLower, ctUpper, dct, nBins, maxBins);
      fflush(stdout);
      }
      if (notSinglePart) {  /* Master needs to know the information to write the result */
	buffer = malloc(sizeof(double) * nBins);
	MPI_Allreduce(ctHist, buffer, nBins, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	memcpy(ctHist, buffer, sizeof(double)*nBins);
	free(buffer);
      }
    }
#endif
    csrWake.s0 = ctLower + dzf;
  } else {
    ctLower = ctUpper = dct = 0;
    csrWake.dctBin = dct;
    csrWake.s0 = ctLower + dzf;
  }
  
  i_top = n_part-1;
  if (isSlave || !notSinglePart) {
    /* handle edge effects, and transform coordinates */    
    for (i_part=0; i_part<=i_top; i_part++) {
      coord = part[i_part];
      if (csbend->edgeFlags&BEND_EDGE2_EFFECTS && e2!=0 && rad_coef) {
	/* post-adjust dp/p to correct error made by integrating over entire sector */
        computeCSBENDFields(&Fx, &Fy, X, Y);
        
	dp_prime = -rad_coef*(sqr(Fx)+sqr(Fy))*sqr(1+DP)*
	  sqrt(sqr(1+X/rho0)+sqr(XP)+sqr(YP));
	DP -= dp_prime*X*tan(e2);
      }

      /* convert CT to distance traveled at final velocity */
      p1 = Po*(1+DP);
      beta1 = p1/sqrt(sqr(p1)+1);
      coord[4] = CT*beta1;

      if (p1<=0) {
	if (!part[i_top]) {
	  printf("error: couldn't swap particles %ld and %ld--latter is null pointer (track_through_csbend)\n",
		  i_part, i_top);
	  fflush(stdout);
	  abort();
	}
	swapParticles(part[i_part], part[i_top]);
	if (accepted) {
	  if (!accepted[i_top]) {
	    printf(
		    "error: couldn't swap acceptance data for particles %ld and %ld--latter is null pointer (track_through_csbend)\n",
		    i_part, i_top);
	    fflush(stdout);
	    abort();
	  }
	  swapParticles(accepted[i_part], accepted[i_top]);
	}
	part[i_top][4] = z_start + dz_lost;
	part[i_top][5] = Po*(1+part[i_top][5]);
	i_top--;
	i_part--;
	continue;
      }

      if (csbend->edgeFlags&BEND_EDGE2_EFFECTS) {
	if (csbend->useMatrix)
	  track_particles(&coord, Me2, &coord, 1);
	else {
	  /* apply edge focusing */
	  rho = (1+DP)*rho_actual;
          if (csbend->edge_order<=1 && csbend->edge_effects[csbend->e2Index]==1) {
	    delta_xp = tan(e2)/rho*X;
	    XP += delta_xp;
	    YP -= tan(e2-psi2/(1+DP))/rho*Y;
          } else if (csbend->edge_order>=2 && csbend->edge_effects[csbend->e2Index]==1)
            apply_edge_effects(&X, &XP, &Y, &YP, rho, n, e2, he2, psi2*(1+DP), 1);
          else if (csbend->edge_effects[csbend->e2Index]==2) {
            rho = (1+DP)*rho_actual;
            /* load input coordinates into arrays */
            Qi[0] = X;
            Qi[1] = XP;
            Qi[2] = Y;
            Qi[3] = YP;
            Qi[4] = 0;  
            Qi[5] = DP;
            convertToDipoleCanonicalCoordinates(Qi, 0);
            dipoleFringeKHwang(Qf, Qi, rho_actual, 1., csbend->edge_order, csbend->b[1]/rho0, e2, 2*csbend->hgap, csbend->fint, csbend->h[csbend->e2Index]);
            /* retrieve coordinates from arrays */
            convertFromDipoleCanonicalCoordinates(Qf, 0);
            X  = Qf[0];  
            XP = Qf[1];  
            Y  = Qf[2];  
            YP = Qf[3];  
            DP = Qf[5];
	  } else if (csbend->edge_effects[csbend->e2Index]==3) {
            applySimpleDipoleEdgeKick(&XP, &YP, X, Y, DP, rho_actual, e2, psi2, -1.0, 0);
          }
        }
      }

      coord = part[i_part];
      x  =  X*cos_ttilt -  Y*sin_ttilt + dcoord_etilt[0];
      y  =  X*sin_ttilt +  Y*cos_ttilt + dcoord_etilt[2];
      xp = XP*cos_ttilt - YP*sin_ttilt + dcoord_etilt[1];
      yp = XP*sin_ttilt + YP*cos_ttilt + dcoord_etilt[3];
      X  = x;
      Y  = y;
      XP = xp;
      YP = yp;
      coord[0] += dxf + dzf*coord[1];
      coord[2] += dyf + dzf*coord[3];
      coord[4] += dzf*sqrt(1+ sqr(coord[1]) + sqr(coord[3])) + dcoord_etilt[4];
    }
    n_part = i_top + 1;
  }

  if (n_partMoreThanOne && !csbend->csrBlock) {
    /* prepare more data for CSRDRIFT */
    int64_t imin, imax;
    double S55;

#if !USE_MPI    
    rms_emittance(part, 0, 1, i_top+1, &csrWake.S11, &csrWake.S12, &csrWake.S22, NULL, NULL);
    rms_emittance(part, 4, 5, i_top+1, &S55, NULL, NULL, NULL, NULL);
#else
    if (notSinglePart) {	
      rms_emittance_p(part, 0, 1, i_top+1, &csrWake.S11, &csrWake.S12, &csrWake.S22, NULL, NULL, NULL);
      rms_emittance_p(part, 4, 5, i_top+1, &S55, NULL, NULL, NULL, NULL, NULL);
    } else {
      rms_emittance(part, 0, 1, i_top+1, &csrWake.S11, &csrWake.S12, &csrWake.S22, NULL, NULL);
      rms_emittance(part, 4, 5, i_top+1, &S55, NULL, NULL, NULL, NULL);
    }
#endif

    csrWake.perc68BunchLength = approximateBeamWidth(0.6826, part, i_top+1, 4)/2;
    csrWake.perc90BunchLength = approximateBeamWidth(0.9, part, i_top+1, 4)/2;
	
    csrWake.rmsBunchLength = sqrt(S55);


#ifdef DEBUG
      fprintf(stderr, "rms bunch length = %le, percentile bunch length (68, 90) = %le, %le\n",
              csrWake.rmsBunchLength, csrWake.perc68BunchLength, csrWake.perc90BunchLength);
#endif
    if (macroParticleCharge) {
      index_min_max(&imin, &imax, csrWake.dGamma, csrWake.bins);
      csrWake.peakToPeakWavelength = 2*fabs(1.0*imax-imin)*dct;
    } else {
      csrWake.peakToPeakWavelength = csrWake.perc68BunchLength;
    }

    csrWake.valid = 1;
    csrWake.rho = rho_actual;
    csrWake.bendingAngle = accumulatingAngle ? fabs(phiBend) : fabs(angle);
    csrWake.Po = Po;
    csrWake.SGOrder = csbend->SGOrder;
    csrWake.SGDerivOrder = csbend->SGDerivOrder;
    csrWake.SGHalfWidth = csbend->SGHalfWidth;
    csrWake.SGDerivHalfWidth = csbend->SGDerivHalfWidth;
    csrWake.GSConstant = CSRConstant*pow(3*rho0*rho0, 1./3.)/2;  /* used for G. Stupakov's drift formulae */
    csrWake.MPCharge = macroParticleCharge;
    csrWake.binRangeFactor = csbend->binRangeFactor;
    csrWake.trapazoidIntegration = csbend->trapazoidIntegration;
    if (csbend->useMatrix) {
      free_matrices(Msection);
      free_matrices(Me1);
      free_matrices(Me2);
      free(Msection);
      free(Me1);
      free(Me2);
      Msection = Me1 = Me2 = NULL;
    }
  }

  if (csbend->csrBlock)
    accumulatedAngle = 0;
  else
    /* accumulate the bending angle just in case the same type of dipole follows */
    accumulatedAngle += fabs(angle);
    
#if defined(MINIMIZE_MEMORY)
  /* leave dGamma out of this because that memory is used by CSRDRIFT */
  free(beta0);
  free(ctHist);
  free(ctHistDeriv);
  free(T1);
  free(T2);
  free(denom);
  if (grnk)
    free(grnk);
  if (chik)
    free(chik);
  beta0 = ctHist = ctHistDeriv = T1 = T2 = denom = NULL;
  maxBins = maxParticles = 0;
#endif

#if (!USE_MPI)
  return(i_top+1);
#else
  if (isSlave || !notSinglePart)
    return(i_top+1);
  else
    return n_part; /* i_top is not defined for master */
#endif 
}
#undef DEBUG_IGF

/* binParticleCoordinate()
 *
 * Builds a histogram of particleCoord[*][coordinateIndex] in *hist.
 *
 * The caller supplies either a positive *binSize (with *bins<=1) or a bin
 * count *bins>=1 (with *binSize<=0); the other quantity is derived from the
 * data range and written back.  *lower and *upper are overwritten with the
 * range of the data, widened about its center by expansionFactor when
 * expansionFactor>1.  *hist is grown with SDDS_Realloc, and *maxBins updated,
 * when *bins exceeds *maxBins.
 *
 * Particles are scanned and counted only when (isSlave || !notSinglePart);
 * in MPI builds with notSinglePart set the range is combined across
 * processors by find_global_min_max().
 *
 * Returns the number of particles that landed in a bin, -1 if neither a bin
 * size nor a bin count was given, or -2 if both were given.
 */
long binParticleCoordinate(double **hist, long *maxBins,
                           double *lower, double *upper, double *binSize, long *bins,
                           double expansionFactor,
                           double **particleCoord, long nParticles, long coordinateIndex)
{
  long ip, slot, counted;
  double q;

  /* exactly one of (*binSize, *bins) may drive the binning */
  if (*bins<1 && *binSize<=0)
    return -1;
  if (*bins>1 && *binSize>0)
    return -2;

  if (isSlave || !notSinglePart) {
    /* scan the local particles for the extreme values of the coordinate */
    *lower = DBL_MAX;
    *upper = -DBL_MAX;
    ip = 0;
    while (ip<nParticles) {
      q = particleCoord[ip][coordinateIndex];
      if (q<*lower)
        *lower = q;
      if (q>*upper)
        *upper = q;
      ip++;
    }
  }

#if USE_MPI
  /* combine the per-processor ranges; the master contributes no particles */
  if (notSinglePart) {
    if (isMaster)
      nParticles = 0;
    find_global_min_max(lower, upper, nParticles, MPI_COMM_WORLD);
  }
#endif

  if (expansionFactor>1) {
    /* stretch the range symmetrically about its midpoint */
    double center, range;
    center = (*lower+*upper)/2;
    range = (*upper-*lower)*expansionFactor;
    *lower = center-range/2;
    *upper = center+range/2;
  }

  /* a given bin size fixes the bin count; the bin size is then recomputed
   * so that an integer number of bins spans the range exactly */
  if (*binSize>0)
    *bins = (*upper-*lower)/(*binSize);
  *binSize = (*upper-*lower)/(*bins);

  /* grow the histogram buffer when it is too small */
  if (*bins>*maxBins) {
    *maxBins = *bins;
    *hist = SDDS_Realloc(*hist, sizeof(**hist)*(*maxBins));
    if (!*hist)
      bombElegant("Memory allocation failure (binParticleCoordinate)", NULL);
  }

  slot = 0;
  while (slot<*bins)
    (*hist)[slot++] = 0;

  counted = 0;
  if (isSlave || !notSinglePart) {
    for (ip=0; ip<nParticles; ip++) {
      /* the coordinate of the bin center is (slot+0.5)*(*binSize) + *lower */
      slot = (particleCoord[ip][coordinateIndex] - *lower)/(*binSize);
      if (slot>=0 && slot<=(*bins-1)) {
        (*hist)[slot] += 1;
        counted++;
      }
    }
  }
  return counted;
}

#if USE_MPI
/* binParticleCoordinate_s()
 *
 * Variant of binParticleCoordinate() that contains no isSlave/notSinglePart
 * tests and no MPI calls: every call scans and bins the particles it is
 * given.
 *
 * Either *binSize>0 (with *bins<=1) or *bins>=1 (with *binSize<=0) must be
 * supplied; the other is derived and written back.  *lower/*upper receive the
 * data range (widened about its center when expansionFactor>1) and *hist is
 * enlarged via SDDS_Realloc when *bins exceeds *maxBins.
 *
 * Returns the number of particles binned, -1 if neither bin size nor bin
 * count was given, -2 if both were given.
 */
long binParticleCoordinate_s(double **hist, long *maxBins,
                           double *lower, double *upper, double *binSize, long *bins,
                           double expansionFactor,
                           double **particleCoord, long nParticles, long coordinateIndex)
{
  long k, slot, tally;
  double q;

  if (*bins<1 && *binSize<=0)
    return -1;
  if (*bins>1 && *binSize>0)
    return -2;

  /* determine the range spanned by the selected coordinate */
  *lower = DBL_MAX;
  *upper = -DBL_MAX;
  for (k=0; k<nParticles; k++) {
    q = particleCoord[k][coordinateIndex];
    if (q<*lower)
      *lower = q;
    if (q>*upper)
      *upper = q;
  }

  if (expansionFactor>1) {
    /* widen the range about its midpoint */
    double center, range;
    center = (*lower+*upper)/2;
    range = (*upper-*lower)*expansionFactor;
    *lower = center-range/2;
    *upper = center+range/2;
  }

  /* bin size given, so determine the number of bins; then make the bin
   * size commensurate with the range */
  if (*binSize>0)
    *bins = (*upper-*lower)/(*binSize);
  *binSize = (*upper-*lower)/(*bins);

  /* enlarge the histogram storage if required */
  if (*bins>*maxBins) {
    *maxBins = *bins;
    *hist = SDDS_Realloc(*hist, sizeof(**hist)*(*maxBins));
    if (!*hist)
      bombElegant("Memory allocation failure (binParticleCoordinate)", NULL);
  }

  for (slot=0; slot<*bins; slot++)
    (*hist)[slot] = 0;

  tally = 0;
  k = 0;
  while (k<nParticles) {
    /* the coordinate of the bin center is (slot+0.5)*(*binSize) + *lower */
    slot = (particleCoord[k][coordinateIndex] - *lower)/(*binSize);
    k++;
    if (slot<0 || slot>(*bins-1))
      continue;
    (*hist)[slot] += 1;
    tally++;
  }
  return tally;
}
#endif

void computeSaldinFdNorm(double **FdNorm, double **x, long *n, double sMax, long ns,
                         double Po, double radius, double angle, double dx, char *normMode);
long track_through_driftCSR_Stupakov(double **part, long np, CSRDRIFT *csrDrift, 
                                     double Po, double **accepted, double zStart, CHARGE *charge, char *rootname);

/* track_through_driftCSR()
 *
 * Tracks np particles through a CSRDRIFT element, applying energy kicks
 * from the wake left in the file-level csrWake structure.
 *
 * part             particle array; part[i][0..5] are modified in place
 * np               number of particles in part
 * csrDrift         element parameters (length, kick count/spacing, mode flags)
 * Po               reference momentum used to convert coord[5] to momentum
 * accepted         passed through to the GPU and Stupakov implementations
 * zStart           position of the start of this element
 * revolutionLength added to zStart while zStart lies before csrWake.zLast
 * charge, rootname passed through to the GPU and Stupakov implementations;
 *                  rootname is also used to compose the Saldin-54 output name
 *
 * If there is at most one particle, csrWake is not valid, or neither CSR nor
 * LSC is requested, the element acts as a plain drift (linear or exact,
 * according to csrDrift->linearOptics).  At most one of the spread,
 * overtaking-length, Saldin-54, attenuation-length and Stupakov modes may be
 * selected, and the mode may not change between calls while csrWake.lastMode
 * is set.  Stupakov mode is delegated to track_through_driftCSR_Stupakov().
 *
 * Otherwise the drift is split into nKicks kicks: a half-step drift, then a
 * kick interpolated from csrWake.dGamma and scaled by a mode-dependent
 * factor, full-step drifts between kicks, and a closing half-step drift.
 * csrWake.s0, csrWake.zLast and (when attenuating) csrWake.dGamma are
 * updated along the way.
 *
 * Returns np (or the GPU routine's return value when the element is on the GPU).
 */
long track_through_driftCSR(double **part, long np, CSRDRIFT *csrDrift, 
                            double Po, double **accepted, double zStart, 
                            double revolutionLength, CHARGE *charge, char *rootname)
{
  long iPart, iKick, iBin, binned=0, nKicks, iSpreadMode=0;
  double *coord, p, beta, dz, ct0=0.0, factor, dz0, dzFirst;
  double ctmin, ctmax, spreadFactor, dct;
  double zTravel, attenuationLength, thetaRad=0.0, sigmaZ, overtakingLength, criticalWavelength, wavelength=0.0;
  static char *spreadMode[3] = {"full", "simple", "radiation-only"};
  static char *wavelengthMode[3] = {"sigmaz", "bunchlength", "peak-to-peak"};
  static char *bunchlengthMode[3] = {"rms", "68-percentile", "90-percentile"};
  unsigned long mode;
  static long warned = 0, incrementWarningsLeft=100;
  long nBins1;
  TRACKING_CONTEXT tContext;
#if USE_MPI 
  long np_total=1, np_tmp=np, binned_total;
#endif

  if (csrDrift->LSCBins && !csrDrift->useStupakov)
    bombElegant("LSCBINS is nonzero on CSRDRIFT but USE_STUPAKOV is zero. This is not supported.", NULL);

#ifdef HAVE_GPU
  if(getElementOnGpu()){
#ifdef GPU_VERIFY     
    /* keep a copy of the wake so the CPU verification run starts from the same state */
    CSR_LAST_WAKE initCsrWake;
    memcpy(&initCsrWake, &csrWake, sizeof(CSR_LAST_WAKE));
#endif
    startGpuTimer();
    iPart = gpu_track_through_driftCSR(np, csrDrift, Po, accepted, zStart, revolutionLength, charge, rootname);
#ifdef GPU_VERIFY     
    startCpuTimer();
    memcpy(&csrWake, &initCsrWake, sizeof(CSR_LAST_WAKE));
    track_through_driftCSR(part, np, csrDrift, Po, accepted, zStart, revolutionLength, charge, rootname);
    compareGpuCpu(np, "track_through_driftCSR");
#endif /* GPU_VERIFY */
    return iPart;
  }
#endif /* HAVE_GPU */
  
  getTrackingContext(&tContext);

  /* Without a usable wake (or with CSR and LSC both off) this is an ordinary drift.
   * Note that the opening brace of the test below is shared by both preprocessor branches.
   */
#if (!USE_MPI)
  if (np<=1 || !csrWake.valid || !(csrDrift->csr || csrDrift->LSCBins)) {
#else
  if (notSinglePart){
    if (isMaster) 
      np_tmp = 0;  
    MPI_Allreduce(&np_tmp, &np_total, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD);   
  } else
    np_total = np;

  if (np_total<=1 || !csrWake.valid || !(csrDrift->csr || csrDrift->LSCBins)) {
    if (isSlave||!notSinglePart) {
#endif
      if (csrDrift->linearOptics) {
        long i;
        for (i=0; i<np; i++) {
          part[i][0] += csrDrift->length*part[i][1];
          part[i][2] += csrDrift->length*part[i][3];
          part[i][4] += csrDrift->length;
        }
      }
      else
        exactDrift(part, np, csrDrift->length);
#if (USE_MPI)
    }
#endif
    return np;
  }	
  nBins1 = csrWake.bins - 1;

  /* one bit per requested drift model; only one may be set (checked below) */
  mode = 
    (csrDrift->spread?CSRDRIFT_SPREAD:0) +
      (csrDrift->useOvertakingLength?CSRDRIFT_OVERTAKINGLENGTH:0) +
        (csrDrift->useSaldin54?CSRDRIFT_SALDIN54:0) +
          (csrDrift->attenuationLength>0?CSRDRIFT_ATTENUATIONLENGTH:0) +
            (csrDrift->useStupakov?CSRDRIFT_STUPAKOV:0) ;
  while ((zStart+1e-12)<csrWake.zLast) {
    if (incrementWarningsLeft) {
      printf("*** Warning: incrementing zStart by revolution length for CSRDRIFT (%s #%ld).\n",
              tContext.elementName, tContext.elementOccurrence);
      printf("    If you are not simulating a ring, this could be a problem!\n");
      incrementWarningsLeft --;
    }
    zStart += revolutionLength;
  }
  if (bitsSet(mode)>1) {
    printf("Error: Too many modes set for CSRDRIFT.\n");
    exitElegant(1);
  }
  if (csrWake.lastMode && csrWake.lastMode!=mode) {
    printf("Error: CSRDRIFT mode changed between dipoles. Pick one mode following each dipole.\n");
    exitElegant(1);
  }
  csrWake.lastMode = mode;
  
  if (mode&CSRDRIFT_STUPAKOV)
    return track_through_driftCSR_Stupakov(part, np, csrDrift, Po, accepted, zStart, charge, rootname);

  if (!warned) {
    printf("Warning: USE_STUPAKOV=1 is recommended for CSRDRIFT elements.\n");
    printf("This is the most physical model available at this time in elegant.\n");
    warned = 1;
  }
  
  dct = csrWake.dctBin;
  /* a positive dz takes precedence over nKicks */
  if (csrDrift->dz>0) {
    if ((nKicks = csrDrift->length/csrDrift->dz)<1)
      nKicks = 1;
  } else 
    nKicks = csrDrift->nKicks;
  if (nKicks<=0)
    bombElegant("nKicks=0 in CSR drift.", NULL);
  /* dz0 is the length per kick; the first drift is half of that */
  dz = (dz0=csrDrift->length/nKicks)/2;
  
  sigmaZ = 0;
  switch (match_string(csrDrift->bunchlengthMode, bunchlengthMode, 3, 0)) {
  case 0:
    sigmaZ = csrWake.rmsBunchLength;
    break;
  case 1:
    sigmaZ = csrWake.perc68BunchLength;
    break;
  case 2:
    sigmaZ = csrWake.perc90BunchLength;
    break;
  default:
    bombElegant("invalid bunchlength_mode for CSRDRIFT.  Use rms or percentile.", NULL);
  }
  
  overtakingLength = pow(24*sigmaZ*csrWake.rho*csrWake.rho, 1./3.);

  if (mode&CSRDRIFT_OVERTAKINGLENGTH)
    attenuationLength = overtakingLength*csrDrift->overtakingLengthMultiplier;
  else
    attenuationLength = csrDrift->attenuationLength;
  
  if (mode&CSRDRIFT_SPREAD) {
    iSpreadMode = 0;
    if (csrDrift->spreadMode && 
        (iSpreadMode=match_string(csrDrift->spreadMode, spreadMode, 3, 0))<0)
      bombElegant("invalid spread_mode for CSR DRIFT.  Use full, simple, or radiation-only", NULL);
    switch (match_string(csrDrift->wavelengthMode, wavelengthMode, 3, 0)) {
    case 0:
    case 1:
      /* bunch length */
      wavelength = sigmaZ;
      break;
    case 2:
      /* peak-to-peak */
      wavelength = csrWake.peakToPeakWavelength;
      break;
    default:
      bombElegant("invalid wavelength_mode for CSR DRIFT.  Use sigmaz or peak-to-peak", NULL);
      break;
    }
    criticalWavelength = 4.19/ipow(csrWake.Po, 3)*csrWake.rho;
    if (!particleIsElectron)
      bombElegant("CSRDRIFT spread mode is not supported for particles other than electrons", NULL);
    thetaRad = 0.5463e-3/(csrWake.Po*0.511e-3)/pow(criticalWavelength/wavelength, 1./3.);
  }

  if (mode&CSRDRIFT_SALDIN54) {
    /* the normalized decay table is built once and kept in csrWake until it is reset */
    if (csrWake.FdNorm==NULL) {
      if (csrDrift->nSaldin54Points<20) 
        csrDrift->nSaldin54Points = 20;
      computeSaldinFdNorm(&csrWake.FdNorm, &csrWake.xSaldin, &csrWake.nSaldin,
                          2*sigmaZ, csrDrift->nSaldin54Points, csrWake.Po, csrWake.rho, csrWake.bendingAngle, dz,
                          csrDrift->normMode);
      if (csrDrift->Saldin54Output)  {
        long ix;
        if (!csrDrift->fpSaldin) {
          csrDrift->Saldin54Output = compose_filename(csrDrift->Saldin54Output, rootname);
          csrDrift->fpSaldin = fopen(csrDrift->Saldin54Output, "w");
          /* fopen returns NULL on failure; stop here rather than hand NULL to fprintf */
          if (!csrDrift->fpSaldin)
            bombElegant("unable to open Saldin54 output file for CSRDRIFT", NULL);
          fprintf(csrDrift->fpSaldin, "SDDS1\n&column name=z, type=double &end\n&column name=Factor, type=double &end\n");
          fprintf(csrDrift->fpSaldin, "&data mode=ascii no_row_counts=1 &end\n");
        } else
          fprintf(csrDrift->fpSaldin, "\n");
        for (ix=0; ix<csrWake.nSaldin; ix++) 
          fprintf(csrDrift->fpSaldin, "%le %le\n", csrWake.xSaldin[ix], csrWake.FdNorm[ix]);
        fflush(csrDrift->fpSaldin);
      }
    }
  }

  /* length skipped between the last CSR element and this one */
  dzFirst = zStart - csrWake.zLast;
  zTravel = zStart-csrWake.z0;  /* total distance traveled by radiation to reach this point */
#ifdef DEBUG
  printf("CSR in drift:\n");
  printf("zStart = %21.15le, zLast = %21.15le, zTravel = %21.15le\n", zStart, csrWake.zLast,
          zTravel);
  printf("dzFirst = %21.15e, s0 = %21.15e\n", dzFirst, csrWake.s0);
#endif

  for (iKick=0; iKick<nKicks; iKick++) {
    /* first drift is dz=dz0/2, others are dz0 */
    if (iKick==1)
      dz = dz0;
    zTravel += dz;

    ctmin = DBL_MAX;
    ctmax = -DBL_MAX;

    /* propagate particles forward, converting s to c*t=s/beta */
    if (isSlave || !notSinglePart) {
    for (iPart=0; iPart<np; iPart++) {
      coord = part[iPart];
      coord[0] += coord[1]*dz;
      coord[2] += coord[3]*dz;
      p = Po*(1+coord[5]);
      beta = p/sqrt(p*p+1);
      if (csrDrift->linearOptics) 
        coord[4] = (coord[4]+dz)/beta;
      else 
        coord[4] = (coord[4]+dz*sqrt(1+sqr(coord[1])+sqr(coord[3])))/beta;
      /* always track the beam's ct extent: it is reported by the
       * "only N of M particles binned" diagnostic below, which is not
       * restricted to DEBUG builds
       */
      if (coord[4]>ctmax)
        ctmax = coord[4];
      if (coord[4]<ctmin)
        ctmin = coord[4];
    }
    }

    factor = 1;
    if (csrWake.dGamma) {
      /* propagate wake forward */
      csrWake.s0 += dz+dzFirst;   /* accumulates position of back end of the radiation pulse */
      ct0 = csrWake.s0;
      
      if (attenuationLength>0) {
        /* attenuate wake */
        if ((factor = exp(-(dz+dzFirst)/attenuationLength))<1) {
          for (iBin=0; iBin<csrWake.bins; iBin++)
            csrWake.dGamma[iBin] *= factor;
        }
      }
      /* factor to account for difference in drift lengths here and in
       * csrcsbend integration.  Use dz0 here because that is the
       * length integrated by each kick.  Add dzFirst to account for any
       * length we may have missed due to intervening non-drift elements.
       */
      factor = (dz0+dzFirst)/csrWake.ds0;
    }
    if (mode&CSRDRIFT_SPREAD) {
      /* compute loss of on-axis field due to spread of beam using a simple-minded
       * computation of beam sizes */
      switch (iSpreadMode) {
      case 0:  /* full */
        factor *= (spreadFactor =
                   sqrt(csrWake.S11/(csrWake.S11 + 
                                     2*zTravel*csrWake.S12 + 
                                     zTravel*zTravel*(sqr(thetaRad)+csrWake.S22))));
        break;
      case 1: /* simple */
        factor *= (spreadFactor =
                   sqrt(csrWake.S11/(csrWake.S11 + zTravel*zTravel*(sqr(thetaRad)+csrWake.S22))));
        break;
      case 2: /* radiation only */
        factor *= (spreadFactor =
                   sqrt(csrWake.S11/(csrWake.S11 + sqr(zTravel*thetaRad))));
        break;
      default:
        bombElegant("invalid spread code---programming error!", NULL);
        break;
      }
    }
    
    if (mode&CSRDRIFT_SALDIN54) {
      long code=0;
      double f0 = 0;
      /* beyond the end of the table the wake is taken to have vanished */
      if (zTravel<=csrWake.xSaldin[csrWake.nSaldin-1]) 
        factor *= (f0=interp(csrWake.FdNorm, csrWake.xSaldin, csrWake.nSaldin, zTravel, 0, 1, &code));
      else 
        factor = 0;
      csrWake.lastFdNorm = f0;
#ifdef DEBUG
      fprintf(csrWake.fpSaldin, "%le %le\n", zTravel, f0);
      fflush(csrWake.fpSaldin);
#endif
      if (!code) {
        fprintf(stderr, "Warning: interpolation failure for Saldin eq. 54\n");
        fprintf(stderr, "zTravel = %le,  csrWake available up to %le\n",
                zTravel, csrWake.xSaldin[csrWake.nSaldin-1]);
        factor = 0;
      }
    }
    
    /* the skipped length is accounted for in the first kick only */
    dzFirst = 0;

    /* apply kick to each particle and convert back to normal coordinates */
    if (isSlave || !notSinglePart) {
    for (iPart=binned=0; iPart<np; iPart++) {
      coord = part[iPart];
      if (csrWake.dGamma) {
        double f;
        /* linear interpolation of the wake between bins iBin and iBin+1 */
        iBin = (f=(coord[4]-ct0)/dct);
        f -= iBin;
        if (iBin>=0 && iBin<nBins1) {
          coord[5] += ((1-f)*csrWake.dGamma[iBin]+f*csrWake.dGamma[iBin+1])/Po*factor;
          binned ++;
        }
      }
      p = (1+coord[5])*Po;
      beta = p/sqrt(p*p+1);
      coord[4] = beta*coord[4];
    }
    }
#if USE_MPI
    /* start from the local count so that binned_total is defined even when
     * the reduction below is skipped (it was previously read uninitialized)
     */
    binned_total = binned;
    if (isSlave && notSinglePart) {
      MPI_Allreduce(&binned, &binned_total, 1, MPI_LONG, MPI_SUM, workers);
    }
    if ((myid==1) && (csrWake.dGamma && np_total!=binned_total)) {
      dup2(fd,fileno(stdout)); /* Let the first slave processor write the output */
      printf("only %ld of %ld particles binned for CSR drift %s (track_through_driftCSR)\n",
              binned_total, np_total, tContext.elementName);
#else
    if (csrWake.dGamma && np!=binned) {
      printf("only %ld of %ld particles binned for CSR drift %s (track_through_driftCSR)\n",
              binned, np, tContext.elementName);
#endif
      printf("beam ct min, max = %21.15e, %21.15e\n",
              ctmin, ctmax);
      printf("wake ct0 = %21.15e, ct1 = %21.15e\n",
              ct0, ct0+csrWake.dctBin*csrWake.bins);
      fflush(stdout);
#if USE_MPI
#if defined(_WIN32)
    freopen("NUL","w",stdout); 
#else
    freopen("/dev/null","w",stdout); 
#endif
#endif  
    }
  }
  /* do final drift of dz0/2 */
  dz = dz0/2;
  if (isSlave || !notSinglePart) {
    for (iPart=0; iPart<np; iPart++) {
      coord = part[iPart];
      coord[0] += coord[1]*dz;
      coord[2] += coord[3]*dz;
      if (csrDrift->linearOptics)
        coord[4] += dz;
      else
        coord[4] += dz*sqrt(1+sqr(coord[1])+sqr(coord[3]));
    }    
  }
  csrWake.zLast = zStart+csrDrift->length;
  
  if (csrWake.dGamma) {
    /* propagate wake forward */
    csrWake.s0 += dz;
    ct0 = csrWake.s0;
    
    if (attenuationLength>0) {
      /* attenuate wake */
      if ((factor = exp(-dz/attenuationLength))<1) {
        for (iBin=0; iBin<csrWake.bins; iBin++)
            csrWake.dGamma[iBin] *= factor;
        }
    }
  }

  return np;
}

/* this should be called before starting to track a beamline to make sure that
 * CSR drift elements upstream of all CSRBEND elements get treated like ordinary
 * drift spaces. */

/* reset_driftCSR()
 *
 * Returns the file-level csrWake record to its "no wake available" state:
 * the mode memory, validity flag, bin count and scalar wake/beam parameters
 * are zeroed, the dGamma pointer is dropped (not freed here), the Saldin
 * tables are freed, and the Stupakov output file is terminated if active.
 * When a valid wake with a Saldin table is present, the last normalization
 * factor is reported first.  Always returns 1.
 */
long reset_driftCSR()
{
  /* report before the state that the report depends on is cleared */
  if (csrWake.valid && csrWake.FdNorm)
    printf("Last value of normalization factor for CSR wake was %le\n",
            csrWake.lastFdNorm);

  csrWake.lastMode = 0;

  csrWake.bins = 0;
  csrWake.valid = 0;

  csrWake.S22 = 0;
  csrWake.S12 = 0;
  csrWake.S11 = 0;
  csrWake.z0 = 0;
  csrWake.zLast = 0;
  csrWake.ds0 = 0;
  csrWake.s0 = 0;
  csrWake.dctBin = 0;

  csrWake.dGamma = NULL;
  csrWake.nSaldin = 0;

  if (csrWake.FdNorm) {
    free(csrWake.FdNorm);
    free(csrWake.xSaldin);
    csrWake.xSaldin = NULL;
    csrWake.FdNorm = NULL;
  }

  if (csrWake.StupakovFileActive) {
    if (!SDDS_Terminate(&csrWake.SDDS_Stupakov))
      bombElegant("problem terminating data file for Stupakov output from CSRDRIFT", NULL);
    csrWake.StupakovFileActive = 0;
  }
  return 1;
}

double SolveForPsiSaldin54(double xh, double sh);
double Saldin5354Factor(double xh, double sh, double phihm, double xhLowerLimit);

/* computeSaldinFdNorm()
 *
 * Allocates and fills a table of the normalized wake-decay factor versus
 * distance, using Saldin5354Factor() averaged over ns values of s in
 * (0, sMax].
 *
 * FdNorm, x   receive newly allocated arrays of *n factors and distances
 * n           receives the table length (100, or more when dx is fine enough,
 *             capped at 100000)
 * sMax, ns    upper end and number of the s samples that are averaged
 * Po          momentum from which gamma and beta are computed
 * radius, bendingAngle   scale s and x and bound the Saldin-53 region
 * dx          requested first step of the distance grid; used only when
 *             smaller than the default step
 * normMode    "first" or "peak": normalize to the first positive or to the
 *             largest table value
 *
 * The distance grid starts at 0, then dx, and grows geometrically by a
 * factor chosen so that fx^n = xEnd/dx.  xEnd = sMax/(1-beta) is limited
 * to 1000.  If the normalizing value is zero the whole table is zeroed.
 */
void computeSaldinFdNorm(double **FdNorm, double **x, long *n, double sMax, long ns,
                         double Po, double radius, double bendingAngle, double dx, 
                         char *normMode)
{
  char *allowedNormMode[2] = {"first", "peak"};
  double gamma, beta, xEnd, dx0, fx;
  double s, sh, phihm, phihs, t1, t2, xhLowerLimit, xUpperLimit, xh;
  double value, firstPositive, peak, norm;
  long ix, is, modeCode;

  gamma = sqrt(sqr(Po)+1);
  beta = Po/gamma;

  xEnd = sMax/(1-beta);
  if (xEnd>1000 || isnan(xEnd) || isinf(xEnd)) {
    fprintf(stderr, "Warning: the extent of the CSR drift wake decay was limited at 1km\n");
    xEnd = 1000;
  }

  /* default table: 100 points with a first step of xEnd/10000 */
  *n = 100;
  dx0 = xEnd/(100*(*n));
  if (!(dx<dx0))
    dx = dx0;
  else {
    /* caller wants a finer first step, so lengthen the table */
    *n = xEnd/(100*dx);
    if (*n>100000) {
      *n = 100000;
      fprintf(stderr, "Note: the CSR drift wake decay table size hit the limit of 100k points\n");
    }
  }
  fx = pow(xEnd/dx, 1./(*n));

  *FdNorm = calloc(sizeof(**FdNorm), (*n));
  if (!*FdNorm || !(*x = malloc(sizeof(**x)*(*n))))
    bombElegant("memory allocation failure (computeSaldinFdNorm)", NULL);

  /* distance grid: 0, dx, fx*dx, fx^2*dx, ... */
  for (ix=0; ix<*n; ix++) {
    if (ix==0)
      (*x)[ix] = 0;
    else
      (*x)[ix] = ipow(fx, ix-1)*dx;
  }

  is = 0;
  while (is<ns) {
    /* don't use s=0 as it is singular */
    s = (is+1.0)*sMax/ns;
    sh = s*ipow(gamma, 3)/radius;
    phihm = bendingAngle*gamma;
    t1 = 12*sh;
    t2 = sqrt(64+144*sh*sh);
    phihs = pow(t1+t2, 1./3.) - pow(-t1 + t2, 1./3.);
    /* a lower limit of -1 means the Saldin-53 branch is never selected */
    xhLowerLimit = phihs>phihm ?
      sh - phihm - ipow(phihm, 3)/6 + sqrt(sqr(ipow(phihm, 3)-6*sh) + 9*ipow(phihm, 4))/6 :
      -1;
    xUpperLimit = 0.999*s/(1-beta);
    /* accumulate up to, but not including, the first grid point at or beyond the limit */
    for (ix=0; ix<*n && !((*x)[ix]>=xUpperLimit); ix++) {
      xh = (*x)[ix]*gamma/radius;
      (*FdNorm)[ix] += Saldin5354Factor(xh, sh, phihm, xhLowerLimit);
    }
    is++;
  }

  /* average over s */
  for (ix=0; ix<*n; ix++)
    (*FdNorm)[ix] /= ns;
  
  /* get the first nonzero and also the maximum value of Fd */
  firstPositive = 0;
  peak = 0;
  for (ix=0; ix<*n; ix++) {
    value = (*FdNorm)[ix];
    if (firstPositive==0 && value>0)
      firstPositive = value;
    if (peak<value)
      peak = value;
  }
  if (peak>firstPositive/0.99) {
    fprintf(stderr, "Warning: possible problem with SALDIN54 drift mode: too few (%ld) points. Max/start-1 is %le\n",
            ns,
            peak/firstPositive-1);
  }

  modeCode = match_string(normMode, allowedNormMode, 2, 0);
  if (modeCode==0) {
    /* first */
    norm = firstPositive;
  } else if (modeCode==1) {
    /* peak */
    norm = peak;
  } else {
    fprintf(stderr, "Error: unknown Saldin-54 normalization mode: %s\n", normMode);
    norm = 0; /* suppress spurious compiler warning */
    exitElegant(1);
  }

  for (ix=0; ix<*n; ix++) {
    if (norm)
      (*FdNorm)[ix] /= norm;
    else
      (*FdNorm)[ix] = 0;
  }
}

double SolveForPsiSaldin54(double xh, double sh)
{
  double s_sum, s_diff2, bestSol;
  double solList[4] = {-1, -1, -1, -1};
  long nSols=0, sol;

  s_sum = (-2*xh - sqrt(-8 + 4*pow(xh,2) - 
                         (4*pow(2,0.3333333333333333)*(-1 + pow(xh,2)))/
                         pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                               3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                                    pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                          3*pow(xh,4),2)),0.3333333333333333) + 
                         2*pow(2,0.6666666666666666)*
                         pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                               3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                                    pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                          3*pow(xh,4),2)),0.3333333333333333)))/2.;
  if (!isnan(s_sum)) {
    s_diff2 = (-16 + 8*pow(xh,2) + (4*pow(2,0.3333333333333333)*(-1 + pow(xh,2)))/
              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                    3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                         pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                               3*pow(xh,4),2)),0.3333333333333333) - 
              2*pow(2,0.6666666666666666)*
              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                    3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                         pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                               3*pow(xh,4),2)),0.3333333333333333) + 
              (16*(-3*sh + pow(xh,3)))/
              sqrt(-8 + 4*pow(xh,2) - 
                   (4*pow(2,0.3333333333333333)*(-1 + pow(xh,2)))/
                   pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
            3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                 pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                       3*pow(xh,4),2)),0.3333333333333333) + 
                   2*pow(2,0.6666666666666666)*
                   pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                         3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                    3*pow(xh,4),2)),0.3333333333333333)))/4.;
    if (s_diff2>=0) {
      solList[0] = s_sum+sqrt(s_diff2);
      solList[1] = s_sum+sqrt(s_diff2);
      nSols = 2;
    }
  }
  
  s_sum =    (-2*xh + sqrt(-8 + 4*pow(xh,2) - 
                            (4*pow(2,0.3333333333333333)*(-1 + pow(xh,2)))/
                            pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                  3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                                       pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                             3*pow(xh,4),2)),0.3333333333333333) + 
                            2*pow(2,0.6666666666666666)*
                            pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                  3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                                       pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                             3*pow(xh,4),2)),0.3333333333333333)))/2.;
  if (!isnan(s_sum)) {
    s_diff2 = (-16 + 8*pow(xh,2) + (4*pow(2,0.3333333333333333)*(-1 + pow(xh,2)))/
              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                    3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                         pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                               3*pow(xh,4),2)),0.3333333333333333) - 
              2*pow(2,0.6666666666666666)*
              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                    3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                         pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                               3*pow(xh,4),2)),0.3333333333333333) - 
              (16*(-3*sh + pow(xh,3)))/
              sqrt(-8 + 4*pow(xh,2) - 
                   (4*pow(2,0.3333333333333333)*(-1 + pow(xh,2)))/
                   pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                         3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                    3*pow(xh,4),2)),0.3333333333333333) + 
                   2*pow(2,0.6666666666666666)*
                   pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                         3*pow(xh,4) + sqrt(4*pow(-1 + pow(xh,2),3) + 
                                              pow(2 + 9*pow(sh,2) - 3*pow(xh,2) - 6*sh*pow(xh,3) + 
                                                    3*pow(xh,4),2)),0.3333333333333333)))/4.;
  
    if (s_diff2>=0) {
      solList[nSols] = s_sum+sqrt(s_diff2);
      solList[nSols+1] = s_sum-sqrt(s_diff2);
      nSols += 2;
    }
  }
  bestSol = solList[0];
  for (sol=0; sol<nSols; sol++) {
    if (solList[sol]>bestSol) {
      bestSol = solList[sol];
    }     
  }
  return bestSol;
}

/* Saldin5354Factor()
 *
 * Returns the wake factor for normalized distance xh and normalized
 * position sh.  Below xhLowerLimit the closed-form "Saldin 53" expression
 * in phihm is used; otherwise psi is obtained from SolveForPsiSaldin54()
 * and inserted into the second expression.  The result is 0 when psi is
 * negative (or NaN) or when the computed factor is NaN or infinite.
 */
double Saldin5354Factor(double xh, double sh, double phihm, double xhLowerLimit)
{
  double factor;

  if (xh<xhLowerLimit) {
    /* use Saldin 53 */
    double t1 = (ipow(phihm, 3) + 3*xh*sqr(phihm) - 6*sh);
    double t2 = 3*(phihm+2*xh);
    factor = 2/(phihm+2*xh)*(1 + (t1 + t2)/sqrt(t1*t1+sqr(phihm*t2))) - 1/sh;
  } else {
    double psi, psiSq;
    psi = SolveForPsiSaldin54(xh, sh);
    /* no usable solution for psi: no contribution */
    if (!(psi>=0))
      return 0;
    psiSq = psi*psi;
    factor =  4*(2*xh*(psiSq+1)+psi*(psiSq+2))/
      (4*xh*xh*(psiSq+1)+4*xh*psi*(psiSq+2)+psiSq*(psiSq+4)) - 1/sh;
  }

  /* singular evaluations are discarded */
  return (isnan(factor) || isinf(factor)) ? 0 : factor;
}

/* exactDrift()
 *
 * Advances np particles through a field-free region of the given length
 * without linearizing the path length: coordinates 0 and 2 move by
 * length times coordinates 1 and 3 respectively, and coordinate 4 grows by
 * length*sqrt(1 + coord[1]^2 + coord[3]^2).  When the element lives on the
 * GPU the work is handed to gpu_exactDrift() instead.
 */
void exactDrift(double **part, long np, double length)
{
  long ip;
  double *q;

#ifdef HAVE_GPU
  if(getElementOnGpu()){
    startGpuTimer();
    gpu_exactDrift(np, length);
#ifdef GPU_VERIFY     
    startCpuTimer();
    exactDrift(part, np, length);
    compareGpuCpu(np, "exactDrift");
#endif /* GPU_VERIFY */
    return;
  }
#endif /* HAVE_GPU */

  ip = 0;
  while (ip<np) {
    q = part[ip];
    q[0] += q[1]*length;
    q[2] += q[3]*length;
    q[4] += length*sqrt(1+sqr(q[1])+sqr(q[3]));
    ip++;
  }
}


double SolveForPhiStupakov(double x, double ds, double phim);
void DumpStupakovOutput(char *filename, SDDS_DATASET *SDDSout, long *active,
                        double zTravel, double *ctHist, double *ctHistDeriv,
                        double *dGamma, long nBins, double dct, 
                        double MPCharge, double dz,
                        long nCaseC, long nCaseD1,long nCaseD2,
                        double x, double dsMax, double phi0, double phi1) ;

double SolveForPhiStupakovDiffSum = 0;
long SolveForPhiStupakovDiffCount = 0;

/* track_through_driftCSR_Stupakov
 *
 * Tracks particles through a CSRDRIFT element, applying energy kicks computed
 * per longitudinal bin ("Case C" and "Case D" contributions below) from the
 * state held in the global csrWake structure (rho, bendingAngle, z0, zLast,
 * filter and smoothing settings, dGamma work array, ...).
 * NOTE(review): csrWake is presumably filled in by the preceding CSR bend;
 * that code is not in this function.
 *
 * The drift is split into nKicks steps.  Each step drifts the particles,
 * converts coord[4] to c*t, histograms it, optionally filters/smooths the
 * histogram, computes csrWake.dGamma[], applies the kick to coord[5] by
 * linear interpolation between bins, and converts coord[4] back.
 * The first drift is dz0/2, the rest are dz0, and a final dz0/2 drift is done
 * after the loop.
 *
 * Arguments:
 *   part, np   particle coordinate arrays and their count (modified in place)
 *   csrDrift   element parameters (length, dz/nKicks, linearOptics, csr, LSC
 *              settings, Stupakov output settings, sliceAnalysisInterval)
 *   Po         reference momentum; beta is computed as p/sqrt(p*p+1) with
 *              p = Po*(1+coord[5])
 *   accepted   not used by this function
 *   zStart     starting position of this drift; if it differs from
 *              csrWake.zLast the missing length is added to the drift
 *   charge     passed through to addLSCKick()
 *   rootname   passed to compose_filename() for the Stupakov output file
 *
 * Returns np.
 */
long track_through_driftCSR_Stupakov(double **part, long np, CSRDRIFT *csrDrift, 
				     double Po, double **accepted, double zStart, CHARGE *charge, char *rootname)
{
  long iPart, iKick, iBin, binned=0, nKicks;
  long nCaseC, nCaseD1, nCaseD2;
  double ctLower, ctUpper, ds;
  long nBins, maxBins, nBinned, diBin;
  double *coord, p, beta, dz, factor, dz0, dzFirst;
  double zTravel, dct, zOutput;
  double *ctHist=NULL, *ctHistDeriv=NULL, *phiSoln=NULL;
  double length;
  long nBins1;
  double dsMax, x;
  TRACKING_CONTEXT tContext;
  LSCKICK lscKick;
#if USE_MPI
  long binned_total=1, np_total=1;
  double *buffer;
#endif

  getTrackingContext(&tContext);

  /* reset the solver accuracy statistics for this element */
  SolveForPhiStupakovDiffCount = 0;
  SolveForPhiStupakovDiffSum = 0;
  
  length = csrDrift->length;
  if (zStart!=csrWake.zLast) {
    length += (dzFirst = zStart-csrWake.zLast);
    /* propagate beam back so we can traverse the missing length including CSR
     */
    if (isSlave || !notSinglePart)
      for (iPart=0; iPart<np; iPart++) {
	coord = part[iPart];
	coord[0] -= dzFirst*coord[1];
	coord[2] -= dzFirst*coord[3];
	if (csrDrift->linearOptics)
	  coord[4] -= dzFirst;
	else
	  coord[4] -= dzFirst*sqrt(1+sqr(coord[1])+sqr(coord[3]));
      }
    zStart = csrWake.zLast;
  }
  zOutput = zStart;  /* absolute coordinate used for output of data vs z or s */
  
  /* number of kicks: from the requested step size (rounded to nearest, at
   * least 1) if dz>0, otherwise taken directly from the element
   */
  if (csrDrift->dz>0) {
    if ((nKicks = length/csrDrift->dz+0.5)<1)
      nKicks = 1;
  } else 
    nKicks = csrDrift->nKicks;
  if (nKicks<=0)
    bombElegant("nKicks=0 in CSR drift.", NULL);
  /* dz0 is the full step; dz starts as the half step used for the first drift */
  dz = (dz0=length/nKicks)/2;
  
  zTravel = zStart-csrWake.z0;  /* total distance traveled by radiation to reach this point */

  maxBins = nBins = csrWake.bins;
  nBins1 = nBins-1;
  if (!(ctHist=SDDS_Malloc(sizeof(*ctHist)*nBins)) ||
      !(ctHistDeriv=SDDS_Malloc(sizeof(*ctHistDeriv)*nBins)) ||
      !(phiSoln=SDDS_Malloc(sizeof(*phiSoln)*nBins)))
    bombElegant("memory allocation failure (track_through_driftCSR)", NULL);
  
  /* copy the LSC settings from the element; the other lscKick fields are
   * only filled in when LSCBins>0
   */
  if ((lscKick.bins = csrDrift->LSCBins)>0) {
    lscKick.interpolate = csrDrift->LSCInterpolate;
    lscKick.radiusFactor = csrDrift->LSCRadiusFactor;
    lscKick.lowFrequencyCutoff0 = csrDrift->LSCLowFrequencyCutoff0;
    lscKick.lowFrequencyCutoff1 = csrDrift->LSCLowFrequencyCutoff1;
    lscKick.highFrequencyCutoff0 = csrDrift->LSCHighFrequencyCutoff0;
    lscKick.highFrequencyCutoff1 = csrDrift->LSCHighFrequencyCutoff1;
    lscKick.backtrack = 0;
  }
  for (iKick=0; iKick<nKicks; iKick++) {
    /* first drift is dz=dz0/2, others are dz0 */
    if (iKick==1)
      dz = dz0;
    zTravel += dz;
    zOutput += dz;
    
    /* x is the travel distance in units of rho; dsMax is the right-hand side
     * of the equation solved by SolveForPhiStupakov (times rho) evaluated at
     * phi = bendingAngle
     */
    x = zTravel/csrWake.rho;
    dsMax = csrWake.rho/24*pow(csrWake.bendingAngle, 3)
      *(csrWake.bendingAngle+4*x)/(csrWake.bendingAngle+x);
    /* propagate particles forward, converting s to c*t=s/beta */
    if (isSlave || !notSinglePart) {
      for (iPart=0; iPart<np; iPart++) {
	coord = part[iPart];
	coord[0] += coord[1]*dz;
	coord[2] += coord[3]*dz;
	p = Po*(1+coord[5]);
	beta = p/sqrt(p*p+1);
	if (csrDrift->linearOptics)
	  coord[4] = (coord[4]+dz)/beta;
	else
	  coord[4] = (coord[4]+dz*sqrt(1+sqr(coord[1])+sqr(coord[3])))/beta;
      }
    }
    /* bin the particle distribution */
    /* NOTE(review): the zeroed limits presumably ask binParticleCoordinate to
     * choose the range itself -- confirm against that function
     */
    ctLower = ctUpper = dct = 0;
    nBinned = binParticleCoordinate(&ctHist, &maxBins,
				    &ctLower, &ctUpper, &dct, &nBins, 
				    csrWake.binRangeFactor<1.1?1.1:csrWake.binRangeFactor,
				    part, np, 4);
    /* Note: the "if (...) {" opened in either preprocessor branch below is
     * closed by the single "}" that follows the shared printf of the bin limits.
     */
#if USE_MPI
  if (notSinglePart) {
    if (isSlave)
      MPI_Allreduce(&np, &np_total, 1, MPI_LONG, MPI_SUM, workers);    
    MPI_Allreduce(&nBinned, &binned_total, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD);
  }

  if (notSinglePart) {  /* Master needs to know the information to write the result */
    buffer = malloc(sizeof(double) * nBins);
    MPI_Allreduce(ctHist, buffer, nBins, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(ctHist, buffer, sizeof(double)*nBins);
    free(buffer);
  }
  if ((myid==1) && (np_total!=binned_total)) {
    dup2(fd,fileno(stdout)); /* Let the first slave processor write the output */
    printf("Only %ld of %ld particles binned for CSRDRIFT (%s, BRF=%le, Stupakov)\n", 
	    binned_total, np_total,
	    tContext.elementName, csrWake.binRangeFactor);
    fflush(stdout);
#else
  if (nBinned!=np) {
    printf("Only %ld of %ld particles binned for CSRDRIFT (%s, BRF=%le, Stupakov)\n", 
	    nBinned, np,
	    tContext.elementName, csrWake.binRangeFactor);
#endif
    printf("ct min, max = %21.15e, %21.15e, dct = %21.15e, nBins=%ld, maxBins=%ld\n",
	    ctLower, ctUpper, dct, nBins, maxBins);
    fflush(stdout);
#if USE_MPI
    /* send stdout back to the null device after the message */
#if defined(_WIN32)
    freopen("NUL","w",stdout); 
#else
    freopen("/dev/null","w",stdout); 
#endif
#endif 
  }
      
    /* - smooth the histogram, normalize to get linear density, and 
       copy in preparation for taking derivative
       */
    if (csrWake.highFrequencyCutoff0>0 || csrWake.lowFrequencyCutoff0>=0) {
      long nz;
      nz = applyLHPassFilters(ctHist, nBins, 
                              csrWake.lowFrequencyCutoff0, csrWake.lowFrequencyCutoff1,
                              csrWake.highFrequencyCutoff0, csrWake.highFrequencyCutoff1,
                              csrWake.clipNegativeBins);
      if (nz && negativeWarningsLeft) {
        printf("Warning: low pass filter resulted in negative values in %ld bins\n",
                nz);
        if (--negativeWarningsLeft==0)
          printf("         Further warnings will be suppressed for this run.\n");
        fflush(stdout);
      }
    }

    if (csrWake.SGHalfWidth>0) {
      SavitzkyGolaySmooth(ctHist, nBins, csrWake.SGOrder, csrWake.SGHalfWidth, csrWake.SGHalfWidth,  0);
      /* clip negative bins and rescale so the histogram sums to the number
       * of binned particles again
       */
#if (!USE_MPI)
      correctDistribution(ctHist, nBins, 1.0*nBinned);
#else
      if (notSinglePart)
	correctDistribution(ctHist, nBins, 1.0*binned_total);
      else
	correctDistribution(ctHist, nBins, 1.0*nBinned);
#endif
    }
    /* divide by the bin width and keep a copy that will become the derivative */
    for (iBin=0; iBin<nBins; iBin++)
      ctHistDeriv[iBin] = (ctHist[iBin] /= dct);
    /* - compute derivative with smoothing.  The deriv is w.r.t. index number and
     * I won't scale it now as it will just fall out in the integral 
     */
    SavitzkyGolaySmooth(ctHistDeriv, nBins, csrWake.SGDerivOrder, 
                       csrWake.SGDerivHalfWidth, csrWake.SGDerivHalfWidth, 1);

    /* Case C */ 
    nCaseC = 0;
    nCaseD1 = 0;
    nCaseD2 = 0;
    for (iBin=0; iBin<nBins; iBin++) {
      double f;
      /* ds and diBin do not depend on iBin; they give a fixed bin offset */
      ds = csrWake.rho/6*sqr(csrWake.bendingAngle)*(csrWake.bendingAngle + 3*x);
      diBin = ds/dct;
      if (iBin+diBin<nBins) {
        f = -1/(csrWake.bendingAngle+2*x); 
        csrWake.dGamma[iBin] = f*ctHist[iBin+diBin];
        nCaseC++;
      } else
        csrWake.dGamma[iBin] = 0;
    }
    /* Case D */
    /* tabulate phi for each bin separation up to dsMax; the first entry left
     * at -1 marks the end of the valid range
     */
    for (iBin=0; iBin<nBins; iBin++) {
      phiSoln[iBin] = -1;
      if ((ds = iBin*dct)>dsMax)
        break;
      phiSoln[iBin] = SolveForPhiStupakov(x, iBin*dct/csrWake.rho, csrWake.bendingAngle);
    }
    for (iBin=0; iBin<nBins; iBin++) {
      long jBin, first, count;
      double term1=0, term2=0;
      diBin = dsMax/dct;
      if (iBin+diBin<nBins) {
        nCaseD1 ++;
        csrWake.dGamma[iBin] += ctHist[iBin+diBin]/(csrWake.bendingAngle+2*x);
      }
      first = 1;
      count = 0;
      /* sum over bins at and after iBin until the tabulated phi runs out */
      for (jBin=iBin; jBin<nBins; jBin++) {
        double phi;
        if ((phi = phiSoln[jBin-iBin])>=0) {
          /* I put in a negative sign here because my s is opposite in direction to 
           * Saldin et al. and Stupakov, so my derivative has the opposite sign.
           * Note lack of ds factor here as I use the same one in my unnormalized derivative.
           */
          if (phi>0) {
            /* ^^^ If I test phi+2*x here, I get noisy, unphysical results very close
             * to the dipole exit 
             */
            term2 = ctHistDeriv[jBin]/(phi+2*x);
            csrWake.dGamma[iBin] -= term2;
            if (first) {
              term1 = term2;
              first = 0;
            }
            count++;
            nCaseD2++;
          }
        } else
          break;
      }
      if (count>1 && csrWake.trapazoidIntegration)
        /* trapezoid rule correction for ends */
        csrWake.dGamma[iBin] += (term1+term2)/2;
    }
    /* the minus sign adjusts for Stupakov using wake<0 to indicate energy gain
     */
    factor = -4/csrWake.rho*csrWake.GSConstant*dz0;
    for (iBin=0; iBin<nBins; iBin++)
      csrWake.dGamma[iBin] *= factor;

    if (csrWake.wffValues) 
      applyFilterTable(csrWake.dGamma, nBins, dct/c_mks, csrWake.wffValues, csrWake.wffFreqValue,
                       csrWake.wffRealFactor, csrWake.wffImagFactor);

    if ((csrDrift->StupakovOutput || csrWake.StupakovFileActive) && 
        (csrDrift->StupakovOutputInterval<2 || iKick%csrDrift->StupakovOutputInterval==0)) {
      /* these locals shadow the outer x and dsMax and are recomputed with the
       * same expressions
       */
      double x, dsMax, phi0, phi1;
      if (!csrWake.StupakovFileActive) {
        if (!SDDS_CopyString(&csrWake.StupakovOutput, csrDrift->StupakovOutput))
          bombElegant("string copying problem preparing Stupakov output for CSRDRIFT", NULL);
        csrWake.StupakovOutput = compose_filename(csrWake.StupakovOutput, rootname);
      }
      x = zTravel/csrWake.rho;
      dsMax = csrWake.rho/24*pow(csrWake.bendingAngle, 3)
        *(csrWake.bendingAngle+4*x)/(csrWake.bendingAngle+x);
      phi0 = SolveForPhiStupakov(x, 0.0, csrWake.bendingAngle);
      phi1 = SolveForPhiStupakov(x, dsMax/csrWake.rho*0.999, csrWake.bendingAngle);
      
      /* note that the contents of ctHist and ctHistDeriv are corrupted by this operation */
      DumpStupakovOutput(csrWake.StupakovOutput, &csrWake.SDDS_Stupakov, 
                         &csrWake.StupakovFileActive, zTravel,
                         ctHist, ctHistDeriv, csrWake.dGamma, nBins, dct, csrWake.MPCharge,
                         dz0, nCaseC, nCaseD1, nCaseD2,
                         x, dsMax/csrWake.rho, phi0, phi1);
    }
    
    /* apply kick to each particle and convert back to normal coordinates */
    if (isSlave || !notSinglePart) {
      for (iPart=binned=0; iPart<np; iPart++) {
	double f;
	coord = part[iPart];
	/* iBin is the bin index and f the fractional position within it,
	 * used for linear interpolation between dGamma[iBin] and dGamma[iBin+1]
	 */
	iBin = (f=(coord[4]-ctLower)/dct);
	f -= iBin;
	if (iBin>=0 && iBin<nBins1) {
          if (csrDrift->csr)
            coord[5] += ((1-f)*csrWake.dGamma[iBin] + f*csrWake.dGamma[iBin+1])/Po;
	  binned ++;
	} else {
	  printf("Particle out of bin range---not kicked: ct-ctLower=%21.15e, dct=%21.15e, iBin=%ld\n",
		  coord[4]-ctLower, dct, iBin);
	}
	/* convert c*t back to s using beta for the (possibly changed) momentum */
	p = (1+coord[5])*Po;
	beta = p/sqrt(p*p+1);
	coord[4] = beta*coord[4];
      }
    }

    if (tContext.sliceAnalysis && tContext.sliceAnalysis->active &&
	(csrDrift->sliceAnalysisInterval==0 ||
	 iKick%csrDrift->sliceAnalysisInterval==0)) {
#if USE_MPI
      /* This function will be parallelized in the future */
      printf("performSliceAnalysisOutput is not supported in parallel mode currently.\n");
      MPI_Barrier(MPI_COMM_WORLD); /* Make sure the information can be printed before aborting */
      MPI_Abort(MPI_COMM_WORLD, 1); 
#endif
	performSliceAnalysisOutput(tContext.sliceAnalysis, part, np, 
				   0, tContext.step, Po, 
				   csrWake.MPCharge*np,
				   tContext.elementName, 
				   zOutput, 0);
    }
    /* As above, the "if (...) {" opened in either preprocessor branch is
     * closed by the single "}" after the shared printf.
     */
#if USE_MPI
    if (isSlave && notSinglePart) {
      MPI_Allreduce(&binned, &binned_total, 1, MPI_LONG, MPI_SUM, workers);
    }
    if ((myid==1) && (np_total!=binned_total)) {
      dup2(fd,fileno(stdout)); /* Let the first slave processor write the output */
      printf("Only %ld of %ld particles kicked for CSRDRIFT (%s, BRF=%le, Stupakov)\n", 
	      binned_total, np_total,
	      tContext.elementName, csrWake.binRangeFactor);
#else
    if (np!=binned) {
      printf("Only %ld of %ld particles kicked for CSRDRIFT (%s, BRF=%le, Stupakov)\n", 
	      binned, np,
	      tContext.elementName, csrWake.binRangeFactor);
#endif
      printf("ct min, max = %21.15e, %21.15e, dct = %21.15e, nBins=%ld, maxBins=%ld\n",
	      ctLower, ctUpper, dct, nBins, maxBins);
      fflush(stdout);
#if USE_MPI
#if defined(_WIN32)
    freopen("NUL","w",stdout); 
#else
    freopen("/dev/null","w",stdout); 
#endif
#endif 
    }

    /* longitudinal space charge kick for this step, if enabled */
    if (csrDrift->LSCBins>0)
      addLSCKick(part, np, &lscKick, Po, charge, dz, 0.0);
  }
  
  /* do final drift of dz0/2 */
  dz = dz0/2;
  if (isSlave || !notSinglePart)
    for (iPart=0; iPart<np; iPart++) {
      coord = part[iPart];
      coord[0] += coord[1]*dz;
      coord[2] += coord[3]*dz;
      if (csrDrift->linearOptics)
	coord[4] += dz;
      else
	coord[4] += dz*sqrt(1+sqr(coord[1])+sqr(coord[3]));
    }    

  if (csrDrift->LSCBins>0)
    addLSCKick(part, np, &lscKick, Po, charge, dz, 0.0);

  /* remember where this drift ended so the next CSR drift can detect a gap */
  csrWake.zLast = zStart + length;
  free(ctHist);
  free(ctHistDeriv);
  free(phiSoln);
#if DEBUG
  if (SolveForPhiStupakovDiffCount)
    printf("Phi solution accuracy for %ld solutions: %le\n",
            SolveForPhiStupakovDiffCount, SolveForPhiStupakovDiffSum/SolveForPhiStupakovDiffCount);
#endif
  return np;
}

/* Parameters of SolveForPhiStupakovFn().  They are kept at file scope because
 * the function is handed to the root finders as a one-argument callback;
 * SolveForPhiStupakov() sets them (to x and 4*x) before each solve.
 */
static double SolveForPhiStupakov_x, SolveForPhiStupakov_4x;

/* Returns phi^3*(phi + 4*x)/(phi + x) for the currently stored x. */
double SolveForPhiStupakovFn(double phi)
{
  double phiCubed, numerator;

  phiCubed = phi*phi*phi;
  numerator = phiCubed*(phi+SolveForPhiStupakov_4x);
  return numerator/(phi+SolveForPhiStupakov_x);
}

/* Solve ds = phi^3/24*(phi+4*x)/(phi+x) for phi, where ds = (s-s')/rho.
 *
 *   x     - parameter of the equation
 *   ds    - left-hand side; negative values give -1, zero gives 0
 *   phim  - largest acceptable phi
 *
 * phim itself is tried first, then Newton's method seeded with the previously
 * accepted solution (phim/2 on the very first call), and finally an
 * interpolation search over [0, 1.01*phim].  Returns the solution, or -1 if
 * the result lies outside [0, phim].  Accepted solutions from the iterative
 * searches update the file-level residual statistics.
 */
double SolveForPhiStupakov(double x, double ds, double phim)
{
  static double phiLast = -1;
  double root, target, tolerance;

  if (ds<0)
    return -1;
  if (ds==0)
    return 0;

  /* fold the factor of 24 into the target so the callback needs no 1/24 */
  target = ds*24;
  tolerance = target/1e4;
  SolveForPhiStupakov_x = x;
  SolveForPhiStupakov_4x = 4*x;

  if (phiLast==-1)
    phiLast = phim/2;

  /* is the upper limit already a good enough answer? */
  if (fabs(target-SolveForPhiStupakovFn(phim))<tolerance) {
    phiLast = phim;
    return phim;
  }

  /* quick attempt with Newton's method, starting from the last solution */
  root = zeroNewton(SolveForPhiStupakovFn, target, phiLast, phim/1000, 3, tolerance);
  if (root<0 || root>phim || fabs(target - SolveForPhiStupakovFn(root))>tolerance)
    /* fall back to the slower interpolation search */
    root = zeroInterp(SolveForPhiStupakovFn, target, 0, phim*1.01, phim/100, tolerance);
  if (root<0 || root>phim)
    return -1;

  phiLast = root;
  SolveForPhiStupakovDiffCount ++;
  SolveForPhiStupakovDiffSum += fabs(target - SolveForPhiStupakovFn(root));
  return root;
}


/* Writes one page of Stupakov-mode CSRDRIFT diagnostics to an SDDS file.
 *
 * If *active is zero the file is created and its layout defined first, and
 * *active is then set to 1.  Each call writes a page with nBins rows holding
 * the columns s, LinearDensity, LinearDensityDeriv, DeltaGamma and GammaDeriv,
 * plus the parameters z, CaseC, CaseD1, CaseD2, x, dsMax, phi0 and phi1.
 *
 * WARNING: this procedure destroys the contents of ctHist and ctHistDeriv.
 * They are first rescaled in place for output and then reused as scratch
 * space for the s and GammaDeriv columns.
 */
void DumpStupakovOutput(char *filename, SDDS_DATASET *SDDSout, long *active,
                        double zTravel, double *ctHist, double *ctHistDeriv,
                        double *dGamma, long nBins, double dct, 
                        double MPCharge, double dz,
                        long nCaseC, long nCaseD1, long nCaseD2,
                        double x, double dsMax, double phi0, double phi1) 
{
  long iBin, layoutOk, columnsOk, pageOk;
  double densityFactor, derivFactor;

  if (!*active) {
    /* first use: open the file and declare every parameter and column */
    layoutOk =
      SDDS_InitializeOutputElegant(SDDSout, SDDS_BINARY, 1, NULL, NULL, filename) &&
      SDDS_DefineParameter(SDDSout, "SVNVersion", NULL, NULL, "SVN version number", NULL, SDDS_STRING, SVN_VERSION)>=0 &&
      SDDS_DefineSimpleParameter(SDDSout, "z", "m", SDDS_DOUBLE) &&
      SDDS_DefineSimpleParameter(SDDSout, "CaseC", "#", SDDS_LONG) &&
      SDDS_DefineSimpleParameter(SDDSout, "CaseD1", "#", SDDS_LONG) &&
      SDDS_DefineSimpleParameter(SDDSout, "CaseD2", "#", SDDS_LONG) &&
      SDDS_DefineSimpleParameter(SDDSout, "x", NULL, SDDS_DOUBLE) &&
      SDDS_DefineSimpleParameter(SDDSout, "dsMax", NULL, SDDS_DOUBLE) &&
      SDDS_DefineSimpleParameter(SDDSout, "phi0", NULL, SDDS_DOUBLE) &&
      SDDS_DefineSimpleParameter(SDDSout, "phi1", NULL, SDDS_DOUBLE) &&
      SDDS_DefineSimpleColumn(SDDSout, "s", "m", SDDS_DOUBLE) &&
      SDDS_DefineSimpleColumn(SDDSout, "LinearDensity", "C/s", SDDS_DOUBLE) &&
      SDDS_DefineSimpleColumn(SDDSout, "LinearDensityDeriv", "C/s$a2$n", SDDS_DOUBLE) &&
      SDDS_DefineSimpleColumn(SDDSout, "DeltaGamma", NULL, SDDS_DOUBLE) &&
      SDDS_DefineSimpleColumn(SDDSout, "GammaDeriv", "1/m", SDDS_DOUBLE) &&
      SDDS_WriteLayout(SDDSout);
    if (!layoutOk) {
      SDDS_SetError("Problem setting up output file for CSRDRIFT (Stupakov mode)");
      SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
    }
    *active = 1;
  }

  /* rescale the density and its derivative in place for output */
  densityFactor = MPCharge*c_mks;
  derivFactor = MPCharge*sqr(c_mks)/dct;
  for (iBin=0; iBin<nBins; iBin++) {
    ctHist[iBin] *= densityFactor;
    ctHistDeriv[iBin] *= derivFactor;
  }

  columnsOk =
    SDDS_StartPage(SDDSout, nBins) &&
    SDDS_SetColumn(SDDSout, SDDS_SET_BY_NAME, dGamma, nBins, "DeltaGamma") &&
    SDDS_SetColumn(SDDSout, SDDS_SET_BY_NAME, ctHist, nBins, "LinearDensity") &&
    SDDS_SetColumn(SDDSout, SDDS_SET_BY_NAME, ctHistDeriv, nBins, "LinearDensityDeriv");
  if (!columnsOk) {
    SDDS_SetError("Problem writing to output file for CSRDRIFT (Stupakov mode)");
    SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
  }

  /* use ctHist array for output of s and ctHistDeriv for dGamma/ds */
  for (iBin=0; iBin<nBins; iBin++) {
    ctHist[iBin] = dct*(iBin+0.5-nBins/2);
    ctHistDeriv[iBin] = dGamma[iBin]/dz;
  }

  pageOk =
    SDDS_SetColumn(SDDSout, SDDS_SET_BY_NAME, ctHist, nBins, "s") &&
    SDDS_SetColumn(SDDSout, SDDS_SET_BY_NAME, ctHistDeriv, nBins, "GammaDeriv") &&
    SDDS_SetParameters(SDDSout, SDDS_SET_BY_NAME|SDDS_PASS_BY_VALUE, 
                       "z", zTravel, "CaseC", nCaseC,
                       "CaseD1", nCaseD1, "CaseD2", nCaseD2, 
                       "x", x, "dsMax", dsMax, "phi0", phi0, "phi1", phi1,
                       NULL) &&
    SDDS_WritePage(SDDSout);
  if (!pageOk) {
    SDDS_SetError("Problem writing to output file for CSRDRIFT (Stupakov mode)");
    SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
  }

  if (!inhibitFileSync)
    SDDS_DoFSync(SDDSout);
}


/* Applies edge effects using non-symplectic K. L. Brown method to second order.
 *
 *   x, xp, y, yp  - coordinates, updated in place
 *   rho           - bending radius (h = 1/rho)
 *   n             - field index term entering the T211, T233 and T431 coefficients
 *   beta          - edge angle
 *   he            - edge curvature term; adds corrections to T211, T233, T431 when nonzero
 *   psi           - angle subtracted from beta in the vertical focusing term R43
 *   which_edge    - -1 and +1 select the two variants of T211/T233/T431; it
 *                   also sets the sign of several coefficients
 */
void apply_edge_effects(
                        double *x, double *xp, double *y, double *yp, 
                        double rho, double n, double beta, double he, double psi, long which_edge
                        )
{
  double h, h2, tanB, tan2B, secB, sec2B;
  double R21, R43;
  double T111, T133, T211, T221, T233, T243, T331, T431, T432, T441;
  double xIn, xpIn, yIn, ypIn;

  /* curvature and the trigonometric factors shared by the coefficients */
  h = 1/rho;
  h2 = sqr(h);
  tanB = tan(beta);
  tan2B = sqr(tanB);
  secB = 1./cos(beta);
  sec2B = sqr(secB);

  /* first-order focusing terms */
  R21 = h*tanB;
  R43 = -h*tan(beta-psi);

  /* second-order terms */
  T111 = which_edge*h/2*tan2B;
  T133 = -which_edge*h/2*sec2B;
  if (which_edge==-1) {
    T211 = -n*h2*tanB;
    T233 = h2*(n+.5+tan2B)*tanB;
  } else {
    T211 = -h2*(n+tan2B/2)*tanB;
    T233 = h2*(n-tan2B/2)*tanB;
  }
  T221 = -which_edge*h*tan2B;
  T331 = T221;
  T441 = -T331;
  T243 = which_edge*h*tan2B;
  T431 = h2*(2*n+(which_edge==1?sec2B:0))*tanB;
  T432 = which_edge*h*sec2B;

  if (he!=0) {
    double heTerm;
    heTerm = h/2*he*sec2B*secB;
    T211 += heTerm;
    T233 -= heTerm;
    T431 -= 2*heTerm;
  }

  /* take copies first: every output depends on several of the inputs */
  xIn = *x;
  xpIn = *xp;
  yIn = *y;
  ypIn = *yp;

  *x  = xIn  + T111*sqr(xIn) + T133*sqr(yIn);
  *xp = xpIn + R21*xIn + T211*sqr(xIn) + T221*xIn*xpIn + T233*sqr(yIn) + T243*yIn*ypIn;
  *y  = yIn  + T331*xIn*yIn;
  *yp = ypIn + R43*yIn + T441*ypIn*xIn + T431*xIn*yIn + T432*xpIn*yIn;
}

/* dipole fringe effects tracking, based on work of Kilean Hwang. Not symplectic for edgeOrder>=2 
 *
 *   Qf         output coordinates; elements 0, 1, 2, 3 and 5 are written
 *              (element 4 is not touched here)
 *   Qi         input coordinates; elements 0, 1, 2, 3 and 5 are read
 *   rho        bending radius
 *   inFringe   -1 selects the entrance formulas, +1 the exit formulas; for any
 *              other value all increments stay zero
 *   edgeOrder  >1 includes terms of second and higher order in the transverse
 *              coordinates; otherwise only linear terms are kept
 *   K1         not used in this function
 *   edge       edge angle
 *   gap        magnet gap term; note the k3 term divides by gap when edgeOrder>1
 *   fint       fringe integral, used as k2
 *   Rhe        multiplies the k4, k5 and k6 terms
 */

void dipoleFringeKHwang(double *Qf, double *Qi,
                     double rho, double inFringe, long edgeOrder, double K1, double edge, double gap, double fint, double Rhe)
{
  double dx, dpx, dy, dpy;
  double tan_edge, sin_edge, sec_edge, cos_edge;
  double x0, px0, y0, py0, dp0;
  /* double psi, Kg; */
  double k0, k3, k2;
  double k4, k5, k6;

  /* fixed coefficients of the fringe model; k5 is zero so its terms vanish */
  k0 = sqr(PI)/6.;
  k2 = fint;
  k3 = 1.0*1./6.;
  /* Kg = gap*fint; */
  k4 = -1.0*sqr(PI)/3.;
  k5 = 0.0;
  k6 = -1.0;

  x0  = Qi[0];  
  px0 = Qi[1];  
  y0  = Qi[2];  
  py0 = Qi[3];  
  dp0 = Qi[5];
  /* increments default to zero so an unrecognized inFringe leaves the coordinates unchanged */
  dx = dpx = dy = dpy = 0;
  /* psi = Kg/rho/cos(edge)*(1+sqr(sin(edge))); */

  sec_edge=1./cos(edge);
  tan_edge=tan(edge);
  sin_edge=sin(edge);
  cos_edge=cos(edge);
  

  if (edgeOrder>1) {

    /* entrance */
    if (inFringe==-1.) {
      dx  =   inFringe*ipow(sec_edge,2)*ipow(gap,2)*k0/rho/(1+dp0)
        + inFringe*ipow(x0,2)*ipow(tan_edge,2)/2/rho/(1+dp0) 
          - inFringe*ipow(y0,2)*ipow(sec_edge,2)/2/rho/(1+dp0);
      dy  =  -inFringe*x0*y0*ipow(tan_edge,2)/rho/(1+dp0);
      dpx  =  -1.*ipow(sec_edge,3)*sin_edge*ipow(gap,2)*k0/rho/rho/(1+dp0)
        +tan_edge*x0/rho
          +ipow(y0,2)/2*(2*ipow(tan_edge,3))/ipow(rho,2)/(1+dp0)
            +ipow(y0,2)/2*(ipow(tan_edge,1))/ipow(rho,2)/(1+dp0)
              -inFringe*(x0*px0-y0*py0)*ipow(tan_edge,2)/rho/(1+dp0)
		+k4*ipow(sin_edge,2)*ipow(gap,2)/2/ipow(cos_edge,3)/rho*Rhe
                  -k5*x0*ipow(sin_edge,1)*ipow(gap,1)/ipow(cos_edge,3)/rho*Rhe
                    +k6*(y0*y0-x0*x0)/2/ipow(cos_edge,3)/rho*Rhe;
      dpy  =  -1.*tan_edge*y0/rho 
        +k2*y0*(1+ipow(sin_edge,2))*gap/(1+dp0)/ipow(rho,2)/ipow(cos_edge,3)
          +inFringe*(x0*py0+y0*px0)*ipow(tan_edge,2)/rho/(1+dp0)
            +inFringe*y0*px0/rho/(1+dp0)
              +k3*ipow(y0,3)*(2./3./cos_edge-4./3./ipow(cos_edge,3))/(1+dp0)/rho/rho/gap
		+k6*x0*y0/ipow(cos_edge,3)/rho*Rhe;
    }
    /* exit */
    if (inFringe==1.) {
      dx  =   inFringe*ipow(sec_edge,2)*ipow(gap,2)*k0/rho/(1+dp0)
        + inFringe*ipow(x0,2)*ipow(tan_edge,2)/2/rho/(1+dp0) 
          - inFringe*ipow(y0,2)*ipow(sec_edge,2)/2/rho/(1+dp0);
      dy  =  -inFringe*x0*y0*ipow(tan_edge,2)/rho/(1+dp0);
      dpx  =  tan_edge*x0/rho
          -ipow(y0,2)/2*(1*ipow(tan_edge,3))/ipow(rho,2)/(1+dp0)
            -ipow(x0,2)/2*(1*ipow(tan_edge,3))/ipow(rho,2)/(1+dp0)
              -inFringe*(x0*px0-y0*py0)*ipow(tan_edge,2)/rho/(1+dp0)
		+k4*ipow(sin_edge,2)*ipow(gap,2)/2/ipow(cos_edge,3)/rho*Rhe
                  -k5*x0*ipow(sin_edge,1)*ipow(gap,1)/ipow(cos_edge,3)/rho*Rhe
                    +k6*(y0*y0-x0*x0)/2/ipow(cos_edge,3)/rho*Rhe;
      dpy  =  -1.*tan_edge*y0/rho 
        +k2*y0*(1+ipow(sin_edge,2))*gap/(1+dp0)/ipow(rho,2)/ipow(cos_edge,3)
          +inFringe*(x0*py0+y0*px0)*ipow(tan_edge,2)/rho/(1+dp0)
            +inFringe*y0*px0/rho/(1+dp0)
              +x0*y0*ipow(sec_edge,2)*tan_edge/ipow(rho,2)/(1+dp0)
		+k3*ipow(y0,3)*(2./3./cos_edge-4./3./ipow(cos_edge,3))/(1+dp0)/rho/rho/gap
                  -k5*y0*ipow(sin_edge,1)*ipow(gap,1)/ipow(cos_edge,3)/rho*Rhe
                    +k6*x0*y0/ipow(cos_edge,3)/rho*Rhe;
    }
    
  } else {
    /* linear terms in transverse coordinates only */

    /* entrance */
    if (inFringe==-1.) {
      dx  =   inFringe*ipow(sec_edge,2)*ipow(gap,2)*k0/rho/(1+dp0);
      dy  =  0;
      dpx  =  -1.*ipow(sec_edge,3)*sin_edge*ipow(gap,2)*k0/rho/rho/(1+dp0)
        +tan_edge*x0/rho
          +k4*ipow(sin_edge,2)*ipow(gap,2)/2/ipow(cos_edge,3)/rho*Rhe
            -k5*x0*ipow(sin_edge,1)*ipow(gap,1)/ipow(cos_edge,3)/rho*Rhe;
      dpy  =  -1.*tan_edge*y0/rho 
        +k2*y0*(1+ipow(sin_edge,2))*gap/(1+dp0)/ipow(rho,2)/ipow(cos_edge,3);
    }

    /* exit */
    if (inFringe==1.) {
      dx  =   inFringe*ipow(sec_edge,2)*ipow(gap,2)*k0/rho/(1+dp0);
      dy  =  0;
      dpx  =  tan_edge*x0/rho
        +k4*ipow(sin_edge,2)*ipow(gap,2)/2/ipow(cos_edge,3)/rho*Rhe
          -k5*x0*ipow(sin_edge,1)*ipow(gap,1)/ipow(cos_edge,3)/rho*Rhe;
      dpy  =  -1.*tan_edge*y0/rho 
        +k2*y0*(1+ipow(sin_edge,2))*gap/(1+dp0)/ipow(rho,2)/ipow(cos_edge,3)
          -k5*y0*ipow(sin_edge,1)*ipow(gap,1)/ipow(cos_edge,3)/rho*Rhe;
    }
  }
  
  /* add the increments; the momentum deviation is passed through unchanged */
  Qf[0]  = x0  + dx;
  Qf[1] = px0 + dpx;
  Qf[2]  = y0  + dy;
  Qf[3] = py0 + dpy;
  Qf[5] = Qi[5];
  /*  printf("x %f y %f xp %f yp %f dp0 %f\n", *x, *y, *xp, *yp, dp0); */
}

/* Symplectic higher-order dipole fringe effects tracking, based on work of Kilean Hwang as further developed by Ryan Lindberg 
 *
 *   Qf         output coordinates; elements 0, 1, 2, 3 and 5 are written
 *              (element 4 is not touched here)
 *   Qi         input coordinates; elements 0, 1, 2, 3 and 5 are read
 *   rho        bending radius
 *   inFringe   -1 selects the entrance map; any other value selects the exit map
 *   K1         not used in this function
 *   edge       edge angle
 *   gap        magnet gap term; note the k3 term divides by gap
 *   fint       fringe integral, used as k2
 *   Rhe        multiplies the k4, k5 and k6 terms
 *
 * Each map is applied as a sequence of five steps, (x0,px0,y0,py0) -> ... ->
 * (x5,px5,y5,py5), marked below.
 */

void dipoleFringeKHwangRLindberg(double *Qf, double *Qi,
				 double rho, double inFringe, double K1, double edge, double gap, double fint, double Rhe)
{
  double tan_edge, sin_edge, sec_edge, cos_edge;
  double cos3_edge, sec2_edge, tan2_edge, tan3_edge;
  double x0, px0, y0, py0, dp0;
  double x1, px1, y1, py1;
  double x2, px2, y2, py2;
  double x3, px3, y3, py3;
  double x4, px4, y4, py4;
  double x5, px5, y5, py5;
  double k0, k3, k2;
  double k4, k5, k6;
  double t1, t2, rho2;
  
  /* fixed coefficients of the fringe model; k5 is zero so its terms vanish */
  k0 = sqr(PI)/6.;
  k2 = fint;
  k3 = 1.0*1./6.;
  k4 = -1.0*sqr(PI)/3.;
  k5 = 0.0;
  k6 = -1.0;
  rho2 = sqr(rho);
  
  x0  = Qi[0];  
  px0 = Qi[1];  
  y0  = Qi[2];  
  py0 = Qi[3];  
  dp0 = Qi[5];

  sec_edge=1./cos(edge);
  tan_edge=tan(edge);
  sin_edge=sin(edge);
  cos_edge=cos(edge);

  sec2_edge = ipow(sec_edge, 2);
  cos3_edge = ipow(cos_edge, 3);
  tan2_edge = ipow(tan_edge, 2);
  tan3_edge = ipow(tan_edge, 3);
  
  if (inFringe==-1) {
    /* entrance */

    /* step 1: momentum kicks that depend on position only */
    x1 = x0;
    px1 = px0 + tan_edge/rho*x0 + tan_edge/(2*rho2*(1+dp0))*sqr(y0) 
      - tan3_edge/(rho2*(1+dp0))*sqr(x0) - gap*k5*sin_edge*Rhe/(rho*cos3_edge)*x0 
      + k6*sec2_edge*Rhe/(2*rho)*(sqr(y0)-sqr(x0));
    y1 = y0;
    py1 = py0 - tan_edge/rho*y0 + tan_edge/(rho2*(1+dp0))*x0*y0
      + gap*k2*(1+sqr(sin_edge))/(rho2*(1+dp0)*cos3_edge)*y0
      + 2*k3*(sqr(cos_edge)-2)/(3*gap*rho2*cos3_edge)*ipow(y0,3)
      + gap*k5*sin_edge*Rhe/(rho*cos3_edge)*y0 + k6*ipow(sec_edge,3)*Rhe/rho*x0*y0;

    /* step 2: x is divided by t1 and px multiplied by t1^2 */
    t1 = (1 + tan2_edge/(2*rho*(1+dp0))*x1);
    x2 = x1/t1;
    px2 = px1*sqr(t1);
    y2 = y1;
    py2 = py1;

    /* step 3: x shifted by a y^2 term, py by a y*px term */
    t1 = sec2_edge/(rho*(1+dp0));
    x3 = x2 + t1/2*sqr(y2);
    px3 = px2;
    y3 = y2;
    py3 = py2 - t1*y2*px2;

    /* step 4: y and py scaled by exp(+/- t1*x), px shifted by a y*py term */
    t1 = tan2_edge/(rho*(1+dp0));
    t2 = exp(t1*x3);
    x4 = x3;
    px4 = px3 - t1*y3*py3;
    y4 = y3*t2;
    py4 = py3/t2;

    /* step 5: coordinate-independent offsets in x and px */
    t1 = sqr(gap)*k0*sec2_edge/(rho*(1+dp0));
    x5 = x4 - t1;
    px5 = px4 - t1*tan_edge/rho + sqr(gap)*k4*sqr(sin_edge)*Rhe/(2*rho*cos3_edge);
    y5 = y4;
    py5 = py4;
  } else {
    /* exit */

    /* step 1: momentum kicks that depend on position only */
    x1 = x0;
    px1 = px0 + tan_edge/rho*x0 + tan3_edge/(2*rho2*(1+dp0))*sqr(y0) 
      + tan3_edge/(2*rho2*(1+dp0))*sqr(x0) - gap*k5*sin_edge*Rhe/(rho*cos3_edge)*x0 
      + k6*sec2_edge*Rhe/(2*rho)*(sqr(y0)-sqr(x0));
    y1 = y0;
    py1 = py0 - tan_edge/rho*y0 + tan3_edge/(rho2*(1+dp0))*x0*y0
      + gap*k2*(1+sqr(sin_edge))/(rho2*(1+dp0)*cos3_edge)*y0
      + 2*k3*(sqr(cos_edge)-2)/(3*gap*rho2*cos3_edge)*ipow(y0,3)
      + gap*k5*sin_edge*Rhe/(rho*cos3_edge)*y0 + k6*ipow(sec_edge,3)*Rhe/rho*x0*y0;

    /* step 2: as for the entrance but with the opposite sign inside t1 */
    t1 = (1 - tan2_edge/(2*rho*(1+dp0))*x1);
    x2 = x1/t1;
    px2 = px1*sqr(t1);
    y2 = y1;
    py2 = py1;

    /* step 3: signs of the x and py shifts are reversed relative to the entrance */
    t1 = sec2_edge/(rho*(1+dp0));
    x3 = x2 - t1/2*sqr(y2);
    px3 = px2;
    y3 = y2;
    py3 = py2 + t1*y2*px2;

    /* step 4: same form as the entrance with t1 negated */
    t1 = -tan2_edge/(rho*(1+dp0));  
    t2 = exp(t1*x3);
    x4 = x3;
    px4 = px3 - t1*y3*py3;
    y4 = y3*t2;
    py4 = py3/t2;

    /* step 5: coordinate-independent offsets in x and px */
    x5 = x4 + sqr(gap)*k0*sec2_edge/(rho*(1+dp0));
    px5 = px4 + sqr(gap)*k4*sqr(sin_edge)*Rhe/(2*rho*cos3_edge);
    y5 = y4;
    py5 = py4;
  }
  
  /* the momentum deviation is passed through unchanged */
  Qf[0]  = x5;
  Qf[1] =  px5;
  Qf[2]  = y5;
  Qf[3] =  py5;
  Qf[5] =  dp0;
  
}

/* this is used solely to convert coordinates inside the element for
 * the purpose of generating output.  It ignores misalignments.
 */

void convertFromCSBendCoords(double **part, long np, double rho0, 
			     double cos_ttilt, double sin_ttilt, 
			     long ctMode)
{
  long ip;
  double x, y, xp, yp, *coord;

  for (ip=0; ip<np; ip++) {
    coord = part[ip];

    x  =  X*cos_ttilt -  Y*sin_ttilt;
    y  =  X*sin_ttilt +  Y*cos_ttilt;
    xp = XP*cos_ttilt - YP*sin_ttilt;
    yp = XP*sin_ttilt + YP*cos_ttilt;

    X  = x;
    Y  = y;
    XP = xp;
    YP = yp;

    if (ctMode)
      coord[4] /= c_mks;
  }
}


/* this is used solely to undo the transformation done by 
 * convertFromCSBendCoords
 */

void convertToCSBendCoords(double **part, long np, double rho0, 
			     double cos_ttilt, double sin_ttilt, long ctMode)
{
  long ip;
  double x, y, xp, yp, *coord;

  for (ip=0; ip<np; ip++) {
    coord = part[ip];

    x  =   X*cos_ttilt +  Y*sin_ttilt;
    y  =  -X*sin_ttilt +  Y*cos_ttilt;
    xp =  XP*cos_ttilt + YP*sin_ttilt;
    yp = -XP*sin_ttilt + YP*cos_ttilt;

    X  = x;
    Y  = y;
    XP = xp;
    YP = yp;

    if (ctMode)
      coord[4] *= c_mks;
  }
}

#include "fftpackC.h"
/* Low-pass filters a histogram in place via FFT.
 *
 * Frequency components below the start index are kept, components between
 * the start and end indices are scaled by a weight that falls linearly from
 * 1, and components above the end index are zeroed.  After the inverse
 * transform the result is passed to correctDistribution() with the original
 * sum of the histogram, and that function's return value is returned.
 */
long applyLowPassFilter(double *histogram, long bins, 
			double start,   /* in units of Nyquist frequency */
			double end      /* in units of Nyquist frequency */
			)
{
  double *spectrum;
  double weight, weightStep, total;
  long nFreq, iStart, iEnd, k;

  spectrum = (double*)malloc(sizeof(*spectrum)*(bins+2));
  if (!spectrum)
    SDDS_Bomb("allocation failure");

  if (end<start)
    end = start;

  nFreq = bins/2 + 1;
  realFFT2(spectrum, histogram, bins, 0);

  /* convert the band edges to frequency indices limited to [0, nFreq-1] */
  iStart = start*nFreq;
  if (iStart<0) 
    iStart = 0;
  if (iStart>nFreq-1)
    iStart = nFreq-1;

  iEnd = end*nFreq;
  if (iEnd<0) 
    iEnd = 0;
  if (iEnd>nFreq-1)
    iEnd = nFreq-1;

  /* linear roll-off across the transition band */
  if (iStart==iEnd)
    weightStep = 0;
  else
    weightStep = 1./(iEnd-iStart);
  weight = 1;
  for (k=iStart; k<=iEnd; k++) {
    spectrum[2*k  ] *= weight;
    spectrum[2*k+1] *= weight;
    weight -= weightStep;
    if (weight<0)
      weight = 0;
  }
  /* everything above the transition band is removed */
  for (; k<nFreq; k++) {
    spectrum[2*k  ] = 0;
    spectrum[2*k+1] = 0;
  }

  realFFT2(spectrum, spectrum, bins, INVERSE_FFT);

  /* record the original sum while copying the filtered data back */
  total = 0;
  for (k=0; k<bins; k++) {
    total += histogram[k];
    histogram[k] = spectrum[k];
  }
  free(spectrum);

  /* normalize to keep the sum constant; don't allow negative values */
  return correctDistribution(histogram, bins, total);
}

/* Applies optional low-pass and high-pass filters to a histogram in place
 * via FFT.
 *
 * The low-pass stage runs only if startLP>0: components between the LP start
 * and end indices are scaled by a weight falling linearly from 1, and those
 * above are zeroed.  The high-pass stage runs only if startHP>0: components
 * below the HP start index are zeroed and those between the HP start and end
 * indices are scaled by a weight rising linearly from 0.
 *
 * If clipNegative is nonzero the filtered histogram is passed to
 * correctDistribution() with the original sum and that function's return
 * value is returned; otherwise 0 is returned.
 */
long applyLHPassFilters(double *histogram, long bins, 
			double startHP,   /* in units of Nyquist frequency */
			double endHP,     /* in units of Nyquist frequency */
			double startLP,   /* in units of Nyquist frequency */
			double endLP,     /* in units of Nyquist frequency */
                        long clipNegative
			)
{
  double *spectrum;
  double weight, weightStep, total;
  long nFreq, iLow, iHigh, k;

  spectrum = (double*)malloc(sizeof(*spectrum)*(bins+2));
  if (!spectrum)
    SDDS_Bomb("allocation failure");

  if (endLP<startLP)
    endLP = startLP;
  if (endHP<startHP)
    endHP = startHP;

  nFreq = bins/2 + 1;
  realFFT2(spectrum, histogram, bins, 0);

  if (startLP>0) {
    /* low-pass: band edges as frequency indices limited to [0, nFreq-1] */
    iLow = startLP*nFreq;
    if (iLow<0) 
      iLow = 0;
    if (iLow>nFreq-1)
      iLow = nFreq-1;

    iHigh = endLP*nFreq;
    if (iHigh<0) 
      iHigh = 0;
    if (iHigh>nFreq-1)
      iHigh = nFreq-1;

    if (iLow==iHigh)
      weightStep = 0;
    else
      weightStep = 1./(iHigh-iLow);
    weight = 1;
    for (k=iLow; k<=iHigh; k++) {
      spectrum[2*k  ] *= weight;
      spectrum[2*k+1] *= weight;
      weight -= weightStep;
      if (weight<0)
        weight = 0;
    }
    for (; k<nFreq; k++) {
      spectrum[2*k  ] = 0;
      spectrum[2*k+1] = 0;
    }
  }

  if (startHP>0) {
    /* high-pass: band edges as frequency indices limited to [0, nFreq-1] */
    iLow = startHP*nFreq;
    if (iLow<0) 
      iLow = 0;
    if (iLow>nFreq-1)
      iLow = nFreq-1;

    iHigh = endHP*nFreq;
    if (iHigh<0) 
      iHigh = 0;
    if (iHigh>nFreq-1)
      iHigh = nFreq-1;

    /* remove everything below the transition band */
    for (k=0; k<iLow; k++) {
      spectrum[2*k  ] = 0;
      spectrum[2*k+1] = 0;
    }

    if (iLow==iHigh)
      weightStep = 0;
    else
      weightStep = 1./(iHigh-iLow);
    weight = 0;
    for (k=iLow; k<=iHigh; k++) {
      spectrum[2*k  ] *= weight;
      spectrum[2*k+1] *= weight;
      weight += weightStep;
      if (weight>1)
        weight = 1;
    }
  }

  realFFT2(spectrum, spectrum, bins, INVERSE_FFT);

  /* record the original sum while copying the filtered data back */
  total = 0;
  for (k=0; k<bins; k++) {
    total += histogram[k];
    histogram[k] = spectrum[k];
  }
  free(spectrum);

  if (!clipNegative)
    return 0;
  /* normalize to keep the sum constant; don't allow negative values */
  return correctDistribution(histogram, bins, total);
}

/* Clips negative entries of array to zero and then rescales the array so
 * that its entries sum to desiredSum.
 *
 *   array       - values to correct, modified in place
 *   npoints     - number of entries
 *   desiredSum  - sum the array should have afterwards
 *
 * If the clipped array sums to zero no rescaling is done.
 * Returns the number of entries that were negative.
 */
long correctDistribution(double *array, long npoints, double desiredSum)
{
  long k, clipped = 0;
  double total = 0;

  for (k=0; k<npoints; k++) {
    if (array[k]<0) {
      clipped++;
      array[k] = 0;
    }
    total += array[k];
  }

  if (total) {
    double scale = desiredSum/total;
    for (k=0; k<npoints; k++)
      array[k] *= scale;
  }
  return clipped;
}

/* Computes the final centroid offsets caused by an error-tilt (ETILT) of the
 * magnet and stores them in dcoord_etilt[0..5].
 *
 *   dcoord_etilt - output array of 6 values
 *   rho0         - bending radius
 *   angle        - bending angle
 *   etilt        - error tilt; if zero, all six outputs are set to zero
 *   tilt         - design tilt; the result is rotated by -tilt at the end
 *
 * The original author refers to pages 90-93 of notebook 1 for the derivation.
 */
void computeEtiltCentroidOffset(double *dcoord_etilt, double rho0, double angle, double etilt, double tilt)
{
  double cosAngle, sinAngle, tanAngle, cosEtilt, sinEtilt;
  double sagitta, q1a, q2a, q3a, q1b, q2b, q3b;
  double qp1, qp2, qp3, norm;
  double dz, tan_alpha;

  if (!etilt) {
    fill_double_array(dcoord_etilt, 6L, 0.0);
    return;
  }

  etilt = -etilt; /* consistent sign convention with TILT */

  cosAngle = cos(angle);
  sinAngle = sin(angle);
  tanAngle = tan(angle);
  cosEtilt = cos(etilt);
  sinEtilt = sin(etilt);

  sagitta = (1-cosAngle)*rho0;
  q1a = sagitta*(cosEtilt-1);
  q2a = 0;
  q3a = sagitta*sinEtilt;

  /* only the third component of the normalized direction is needed below */
  qp1 = sinAngle*cosEtilt;
  qp2 = cosAngle;
  norm = sqrt(sqr(qp1)+sqr(qp2));
  qp3 = sinAngle*sinEtilt/norm;

  tan_alpha = 1./tanAngle/cosEtilt;
  q1b = q1a*tan_alpha/(tanAngle+tan_alpha);
  q2b = -q1b*tanAngle;
  dz  = sqrt(sqr(q1b-q1a)+sqr(q2b-q2a));
  q3b = q3a + qp3*dz;

  dcoord_etilt[0] = sqrt(sqr(q1b) + sqr(q2b));
  dcoord_etilt[1] = tan(atan(tan_alpha)-(PIo2-angle));
  dcoord_etilt[2] = q3b;
  dcoord_etilt[3] = qp3;
  dcoord_etilt[4] = dz*sqrt(1+sqr(qp3));
  dcoord_etilt[5] = 0;

#ifdef DEBUG
  printf("pre-tilt offsets due to ETILT=%le:  %le %le %le %le %le\n",
          etilt, dcoord_etilt[0], dcoord_etilt[1], dcoord_etilt[2],
          dcoord_etilt[3], dcoord_etilt[4]);
  fflush(stdout);
#endif

  /* rotate by tilt to get into same frame as bend equations. */
  rotateCoordinatesForMisalignment(dcoord_etilt, -tilt);
#ifdef DEBUG
  printf("offsets due to ETILT=%le:  %le %le %le %le %le\n",
          etilt, dcoord_etilt[0], dcoord_etilt[1], dcoord_etilt[2],
          dcoord_etilt[3], dcoord_etilt[4]);
  fflush(stdout);
#endif
}

/* Read the CSRCSBEND wake filter table from an SDDS file.
 *
 * values    on return, the number of rows read from the first page (at least 2)
 * freq      on return, newly allocated array of frequencies; must be
 *           strictly increasing and the column must have units of Hz
 * real      on return, newly allocated array from column realName
 * imag      on return, newly allocated array from column imagName
 * freqName  name of the frequency column (must not be NULL or blank)
 * realName  name of the real-part column (must not be NULL or blank)
 * imagName  name of the imaginary-part column (must not be NULL or blank)
 * filename  SDDS file, located via the SDDS search path
 *
 * Any problem (unreadable file, too few rows, bad column, non-monotonic
 * frequency) is fatal: a message is printed and the program exits.
 * The input dataset is released before returning; the returned arrays are
 * independent copies owned by the caller.
 */
void readWakeFilterFile(long *values, 
                        double **freq, double **real, double **imag, 
                        char *freqName, char *realName, char *imagName,
                        char *filename)
{
  SDDS_DATASET SDDSin;
  long i;
  
  /* SDDS_ReadPage returns the page number (>0) on success, 0 on error and
   * -1 at end-of-file, so anything <=0 means no page was read.
   */
  if (!SDDS_InitializeInputFromSearchPath(&SDDSin, filename) || SDDS_ReadPage(&SDDSin)<=0) {
    fprintf(stderr, "Error: unable to open or read CSRCSBEND wake filter file %s\n", filename);
    exitElegant(1);
  }
  if ((*values = SDDS_RowCount(&SDDSin))<2) {
    fprintf(stderr, "Error: too little data in CSRCSBEND wake filter file %s\n", filename);
    exitElegant(1);
  }

  /* frequency column: must exist, be floating point, and be in Hz */
  if (!freqName || !strlen(freqName))
    SDDS_Bomb("WFF_FREQ_COLUMN is blank in CSRCSBEND");
  if (SDDS_CheckColumn(&SDDSin, freqName, "Hz", SDDS_ANY_FLOATING_TYPE, stdout)!=SDDS_CHECK_OK) {
    fprintf(stderr, "Error: column %s invalid in CSRCSBEND wake filter file %s---check existence, type, and units (Hz).\n", 
            freqName, filename);
    exitElegant(1);
  }

  /* real-part column: must exist and be floating point (units not checked) */
  if (!realName || !strlen(realName))
    SDDS_Bomb("WFF_REAL_COLUMN is blank in CSRCSBEND");
  if (SDDS_CheckColumn(&SDDSin, realName, NULL, SDDS_ANY_FLOATING_TYPE, stdout)!=SDDS_CHECK_OK) {
    fprintf(stderr, "Error: column %s invalid in CSRCSBEND wake filter file %s---check existence and type.\n", 
            realName, filename);
    exitElegant(1);
  }

  /* imaginary-part column: must exist and be floating point (units not checked) */
  if (!imagName || !strlen(imagName))
    SDDS_Bomb("WFF_IMAG_COLUMN is blank in CSRCSBEND");
  if (SDDS_CheckColumn(&SDDSin, imagName, NULL, SDDS_ANY_FLOATING_TYPE, stdout)!=SDDS_CHECK_OK) {
    fprintf(stderr, "Error: column %s invalid in CSRCSBEND wake filter file %s---check existence and type.\n", 
            imagName, filename);
    exitElegant(1);
  }

  if (!(*freq=SDDS_GetColumnInDoubles(&SDDSin, freqName)) ||
      !(*real=SDDS_GetColumnInDoubles(&SDDSin, realName)) ||
      !(*imag=SDDS_GetColumnInDoubles(&SDDSin, imagName))) {
    fprintf(stderr, "Problem getting data from CSRCSBEND wake filter file %s.\n", filename);
    SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
    exitElegant(1);
  }

  /* The column data have been copied out, so release the file and the
   * dataset's internal storage.  A failure here does not invalidate the
   * data already obtained, so it is reported but not treated as fatal.
   */
  if (!SDDS_Terminate(&SDDSin))
    SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);

  /* interpolation in the table requires strictly increasing frequency */
  for (i=1; i<*values; i++) {
    if ((*freq)[i-1]>=(*freq)[i]) {
      fprintf(stderr, "Error: frequency data is not monotonically increasing in CSRCSBEND wake filter file %s.\n", filename);
      exitElegant(1);
    }
  }
}

/* Filter a sampled function in the frequency domain using a tabulated filter.
 *
 * function  data to filter, bins samples spaced by dx; overwritten with the result
 * bins      number of samples in function
 * dx        sample spacing
 * fValues   number of points in the filter table
 * fFreq     filter table frequencies
 * fReal     factor applied to the real part of each spectral component
 * fImag     factor applied to the imaginary part of each spectral component
 *
 * The data are transformed with realFFT2, the real and imaginary parts of
 * each component are scaled independently by the values interpolated from
 * fReal and fImag at that component's frequency, and the result is
 * transformed back.
 */
void applyFilterTable(double *function, long bins, double dx, long fValues,
                     double *fFreq, double *fReal, double *fImag)
{
  double *spectrum;
  double df, freq;
  long nFreq, k, reIndex, imIndex, interpCode;

  /* room for bins/2+1 complex values */
  spectrum = (double*)malloc(sizeof(*spectrum)*(bins+2));
  if (spectrum==NULL)
    SDDS_Bomb("allocation failure");

  nFreq = bins/2 + 1;
  /* frequency spacing is the reciprocal of the total length dx*(bins-1) */
  df = 1.0/(dx*(bins-1));
  realFFT2(spectrum, function, bins, 0);

  for (k=0; k<nFreq; k++) {
    reIndex = 2*k;
    imIndex = reIndex + 1;
    freq = k*df;
    spectrum[reIndex] *= interp(fReal, fFreq, fValues, freq, 0, 1, &interpCode);
    spectrum[imIndex] *= interp(fImag, fFreq, fValues, freq, 0, 1, &interpCode);
  }
  realFFT2(spectrum, spectrum, bins, INVERSE_FFT);

  /* hand the filtered data back in the caller's buffer */
  for (k=0; k<bins; k++)
    function[k] = spectrum[k];
  free(spectrum);
}

/* Apply the effect of synchrotron radiation over one integration step to a
 * single particle.
 *
 * Qx, Qy            transverse momenta of the particle, updated in place
 * dPoP              fractional momentum deviation, updated in place
 * sigmaDelta2       if non-NULL, accumulates the variance contribution of the
 *                   ISR term (only in the non-distribution-based mode)
 * x, y              transverse position
 * theta, thetaf     passed through unchanged to logPhoton()
 * h0                curvature; 1/h0 is passed to logPhoton() as the radius and
 *                   SIGN(h0) is applied to the opening-angle kicks
 * Fx, Fy            field components; only Fx^2+Fy^2 is used
 * ds, radCoef       path length and coefficient for the mean energy loss
 * dsISR, isrCoef    path length and coefficient for the random (ISR) part
 * distributionBased if zero, use the mean-loss plus gaussian-noise model;
 *                   otherwise emit individual photons
 * includeOpeningAngle  in photon mode, add angular kicks from photon emission
 *                   (this parameter hides the file-scope variable of the same name)
 * meanPhotonsPerMeter  used to compute the mean photon count in photon mode
 * normalizedCriticalEnergy0  scaled by the field magnitude to get the local value
 *                   (this parameter hides the file-scope variable of the same name)
 * Po                divides the rms opening angle and multiplies the
 *                   photon energy passed to logPhoton()
 *
 * The order of the random-number calls below determines the random sequence
 * seen by each particle, so statements should not be reordered.
 */
void addRadiationKick(double *Qx, double *Qy, double *dPoP, double *sigmaDelta2,
		      double x, double y, double theta, double thetaf, double h0, double Fx, double Fy,
		      double ds, double radCoef, double dsISR, double isrCoef,
                      long distributionBased, long includeOpeningAngle, double meanPhotonsPerMeter,
                      double normalizedCriticalEnergy0, double Po)
{
  double f, xp, yp, F2, F, deltaFactor, dsFactor;
  
  /* convert momenta to slopes; dsFactor scales ds and dsISR to the path actually travelled */
  f = (1+x*h0)/sqrt(sqr(1+*dPoP)-sqr(*Qx)-sqr(*Qy));
  xp = *Qx*f;
  yp = *Qy*f;
  dsFactor = sqrt(sqr(1+x*h0)+sqr(xp)+sqr(yp));
  F2 = sqr(Fx)+sqr(Fy);

  if (!distributionBased) {
    deltaFactor = sqr(1 + *dPoP);
    /* divide out the old (1+delta) here and multiply by the new one below,
     * so Qx and Qy scale with the change in momentum
     */
    *Qx /= (1 + *dPoP);
    *Qy /= (1 + *dPoP);
    if (radCoef)
      *dPoP -= radCoef*deltaFactor*F2*ds*dsFactor;
    if (isrCoef>0)
      /* The minus sign is for consistency with the previous version. */
      *dPoP -= isrCoef*deltaFactor*pow(F2,0.75)*sqrt(dsISR*dsFactor)*gauss_rn_lim(0.0, 1.0, srGaussianLimit, random_2);
    if (sigmaDelta2)
      *sigmaDelta2 += sqr(isrCoef*deltaFactor)*pow(F2,1.5)*dsISR*dsFactor;
    *Qx *= (1 + *dPoP);
    *Qy *= (1 + *dPoP);
  } else {
    double dtheta=0, dphi=0;
    double yph, logyph;
    double normalizedCriticalEnergy;
    double nMean, dDelta, thetaRms;
    long i, nEmitted;
    long rhoSign;
    F = sqrt(F2);
    rhoSign = SIGN(h0);
    /* Compute the mean number of photons emitted = meanPhotonsPerMeter*meters */
    /* Note that unlike the #photons/radian, this is independent of energy */
    nMean = meanPhotonsPerMeter*dsISR*dsFactor*F;
    /* Pick the actual number of photons emitted from Poisson distribution */
    nEmitted = inversePoissonCDF(nMean, random_2(1));
    /* Adjust normalized critical energy to local field strength (FSE is already included via rho_actual) */
    normalizedCriticalEnergy = normalizedCriticalEnergy0*F;
    /* For each photon, pick its energy and emission angles */
    for (i=0; i<nEmitted; i++) {
      /* Pick photon energy normalized to critical energy */
      yph=pickNormalizedPhotonEnergy(random_2(1));
      /* Multiply by critical energy normalized to central beam energy, adjusting for variation with
       * individual electron energy offset. Note that it goes like (1+delta)^2, not (1+delta)^3 
       * because the bending radius also depends on (1+delta) 
       */
      dDelta = normalizedCriticalEnergy*sqr(1 + *dPoP)*yph;
      /* file-scope statistics counters */
      photonCount ++;
      energyCount += yph;
      /* Change the total electron momentum */
      *dPoP -= dDelta;
      if (includeOpeningAngle) {
        /* Compute rms spread in electron angle = (rms photon angle)*dDelta */
        /* the rms photon angle is a polynomial fit in log10(yph) */
        logyph = log10(yph);
        thetaRms = dDelta*pow(10,
                              -2.418673276661232e-01
                              + logyph*(-4.472680955382907e-01+logyph*(-4.535350424882360e-02
                                                                   -logyph*6.181818621278201e-03)))/Po;
        /* Compute change in electron angle due to photon angle */
        dtheta = thetaRms*gauss_rn_lim(0.0, 1.0, srGaussianLimit, random_2);
        dphi = thetaRms*gauss_rn_lim(0.0, 1.0, srGaussianLimit, random_2);
        /* the photon is logged using the electron slope before the recoil is applied */
        if (SDDSphotons)
          logPhoton(dDelta*Po, x, xp-dtheta/dDelta, y, yp-dphi/dDelta, theta, thetaf, 1/h0);
        /* rhoSign factor is for backward compatibility */
        xp += dtheta*rhoSign;
        yp += dphi*rhoSign;
      } else {
        /* dtheta and dphi are still zero here, so the photon is logged along the electron direction */
        if (SDDSphotons)
          logPhoton(dDelta*Po, x, xp-dtheta/dDelta, y, yp-dphi/dDelta, theta, thetaf, 1/h0);
      }
    }
    /* convert the (possibly kicked) slopes back to momenta using the new momentum deviation */
    f = (1 + *dPoP)/sqrt(sqr(1+x*h0)+sqr(xp)+sqr(yp));
    *Qx = xp*f;
    *Qy = yp*f;
  }
  
}

long inversePoissonCDF(double mu, double C)
{
  double sum, expMinusMu, term;
  long r, rMax;
  
  r = 0;
  if ((rMax = 50*mu)<10)
    rMax = 10;
  expMinusMu = exp(-mu);
  term = sum = expMinusMu;
  while (r<=rMax && C>=sum) {
    term *= mu/(++r);
    sum += term;
  }
  /* fprintf(stderr, "inversePoissonCDF: r=%ld for mu=%e, C=%e\n", r, mu, C); */
  return r;
}

/* Return randomly-chosen photon energy normalized to the critical energy */
/* RN is used as the lookup coordinate into FTable (which runs from 0 to 1);
 * the matching value from ksiTable is returned, so this acts as a table-based
 * inverse lookup.  Presumably RN is a uniform random number on [0,1] and
 * FTable is the cumulative photon-energy distribution -- confirm against the
 * code that generated the tables.
 * If interp() reports a zero return code, the first entry of ksiTable is
 * returned instead of the interpolated value.
 */
double pickNormalizedPhotonEnergy(double RN) 
{
  long interpCode;
  double value;
  /* 200 normalized photon energies, increasing from 1e-7 to about 31.6 */
  static double ksiTable[200] = {
1.000000000000000e-07, 1.103351074554523e-07, 1.217383310646075e-07, 1.343200559586096e-07, 1.482020747927429e-07,
1.635189113578160e-07, 1.804187271717404e-07, 1.990651067932036e-07, 2.196385495378513e-07, 2.423384085535426e-07,
2.673842889192374e-07, 2.950186137528527e-07, 3.255088868939687e-07, 3.591505332405233e-07, 3.962690515521040e-07,
4.372236992560780e-07, 4.824109227537678e-07, 5.322685173365927e-07, 5.872789374272963e-07, 6.479745823257918e-07,
7.149429940622619e-07, 7.888329490825732e-07, 8.703595415651299e-07, 9.603117573208774e-07, 1.059560346180786e-06,
1.169066741218290e-06, 1.289890855361064e-06, 1.423201921186706e-06, 1.570290408531488e-06, 1.732581081736759e-06,
1.911644944229382e-06, 2.109214732888093e-06, 2.327202951554652e-06, 2.567720983189001e-06, 2.833097369770374e-06,
3.125899929822278e-06, 3.448963034857157e-06, 3.805415579829938e-06, 4.198708930670648e-06, 4.632648449870741e-06,
5.111434736502657e-06, 5.639704554113462e-06, 6.222573518736223e-06, 6.865680963315421e-06, 7.575252265320169e-06,
8.358158724774669e-06, 9.221982709850737e-06, 1.017508139438384e-05, 1.122668092062936e-05, 1.238696398672945e-05,
1.366716918178818e-05, 1.507968136669955e-05, 1.663817388115531e-05, 1.835773664119261e-05, 2.025502748788774e-05,
2.234840008736815e-05, 2.465811862080412e-05, 2.720654509486478e-05, 3.001836976776838e-05, 3.312079173855694e-05,
3.654384295607505e-05, 4.032066206311618e-05, 4.448784496925588e-05, 4.908569920644159e-05, 5.415873276357582e-05,
5.975605436563109e-05, 6.593190648243325e-05, 7.274602260436053e-05, 8.026436452046312e-05, 8.855971070475770e-05,
9.771244913354602e-05, 1.078111117026042e-04, 1.189534517802125e-04, 1.312473286188584e-04, 1.448118705606887e-04,
1.597782996280689e-04, 1.762914808166189e-04, 1.945112638414439e-04, 2.146141870658145e-04, 2.367947478384840e-04,
2.612676279365202e-04, 2.882697280504828e-04, 3.180626634504074e-04, 3.509347182037930e-04, 3.872040386295810e-04,
4.272217166086854e-04, 4.713754443547046e-04, 5.200925166529275e-04, 5.738444084509028e-04, 6.331514454110419e-04,
6.985881553542330e-04, 7.707878748140519e-04, 8.504493031356108e-04, 9.383435740575402e-04, 1.035322090321881e-03,
1.142323583631410e-03, 1.260383487096962e-03, 1.390644637378946e-03, 1.534368742954572e-03, 1.692947192375836e-03,
1.867914439234830e-03, 2.060964191854564e-03, 2.273966197872862e-03, 2.508982774009835e-03, 2.768287894108917e-03,
3.054391670052556e-03, 3.370064913211098e-03, 3.718364390303022e-03, 4.102660002531562e-03, 4.526671789275104e-03,
4.994505863796759e-03, 5.510692966986859e-03, 6.080227102658105e-03, 6.708621448554406e-03, 7.401960924834442e-03,
8.166960997457158e-03, 9.011022498794720e-03, 9.942316075907171e-03, 1.096985907697749e-02, 1.210360516825099e-02,
1.335452196639184e-02, 1.473471854468308e-02, 1.625755786686624e-02, 1.793779326078131e-02, 1.979167811637735e-02,
2.183715833862031e-02, 2.409403673367596e-02, 2.658418068941497e-02, 2.933167681381903e-02, 3.236312132092089e-02,
3.570786033002121e-02, 3.939830569107346e-02, 4.347015239952341e-02, 4.796281661196050e-02, 5.291978786613488e-02,
5.838910396032759e-02, 6.442366592223334e-02, 7.108188831787103e-02, 7.842822357081021e-02, 8.653385973120035e-02,
9.547720673026440e-02, 1.053448316706813e-01, 1.162322544203413e-01, 1.282449697899047e-01, 1.414991970199129e-01,
1.561232236887442e-01, 1.722586122799903e-01, 1.900616973883389e-01, 2.097047392306722e-01, 2.313778527830923e-01,
2.552908366685073e-01, 2.816753658394816e-01, 3.107867644741113e-01, 3.429067722728065e-01, 3.783463152734567e-01,
4.174487166108054e-01, 4.605924179038830e-01, 5.081949415377639e-01, 5.607170865377987e-01, 6.186676294134220e-01,
6.826074965088431e-01, 7.531554325044472e-01, 8.309943513916955e-01, 9.168782178988778e-01, 1.011638437307961e+00,
1.116191954411967e+00, 1.231550862322391e+00, 1.358832474428039e+00, 1.499269100004806e+00, 1.654219599620752e+00,
1.825183916868001e+00, 2.013817817791925e+00, 2.221947831729780e+00, 2.451587713539253e+00, 2.704960411634972e+00,
2.984519634347505e+00, 3.292972664221137e+00, 3.633303772560254e+00, 4.008807416353689e+00, 4.423119788364888e+00,
4.880253623874576e+00, 5.384631444881934e+00, 5.941135706944927e+00, 6.555154946882635e+00, 7.232636842499024e+00,
7.980135322277263e+00, 8.804886289535018e+00, 9.714875109915180e+00, 1.071891743371295e+01, 1.182672581369469e+01,
1.304902401296616e+01, 1.439764568247248e+01, 1.588565738231238e+01, 1.752745256838863e+01, 1.933892408641842e+01,
2.133760842747432e+01, 2.354287285156119e+01, 2.597604764912193e+01, 2.866068635656761e+01, 3.162277660168377e+01,
  };
  /* 200 values, non-decreasing from 0 to 1, paired one-to-one with ksiTable */
  static double FTable[200] = {
0.000000000000000e+00, 1.916076787477782e-04, 3.896006996482199e-04, 5.941918318862451e-04, 8.056009324383097e-04,
1.024055848381587e-03, 1.249790750550654e-03, 1.483048166648730e-03, 1.724078746036354e-03, 1.973142196708657e-03,
2.230505581886648e-03, 2.496445345396121e-03, 2.771247236692068e-03, 3.055207274452791e-03, 3.348630028390361e-03,
3.651830594751359e-03, 3.965134731031564e-03, 4.288879835176022e-03, 4.623413241840414e-03, 4.969094094184835e-03,
5.326293748409966e-03, 5.695396753910589e-03, 6.076799201662367e-03, 6.470910424341261e-03, 6.878153743490802e-03,
7.298967431515415e-03, 7.733803160081558e-03, 8.183127443537616e-03, 8.647422816544402e-03, 9.127188753348749e-03,
9.622940276393589e-03, 1.013520903749105e-02, 1.066454503150837e-02, 1.121151744173274e-02, 1.177671348365064e-02,
1.236073899369794e-02, 1.296422080554008e-02, 1.358780748228750e-02, 1.423216848805338e-02, 1.489799412460975e-02,
1.558599872144761e-02, 1.629692120583610e-02, 1.703152471578071e-02, 1.779059568966145e-02, 1.857494805017956e-02,
1.938542355142036e-02, 2.022289197174982e-02, 2.108824911944652e-02, 2.198242222447622e-02, 2.290636998738409e-02,
2.386108352008151e-02, 2.484758298036615e-02, 2.586692442491193e-02, 2.692019946744418e-02, 2.800853716992024e-02,
2.913309895920114e-02, 3.029508724043934e-02, 3.149574455091229e-02, 3.273635664445640e-02, 3.401824527268329e-02,
3.534277891084329e-02, 3.671137126806664e-02, 3.812548585471972e-02, 3.958662612641901e-02, 4.109634873515033e-02,
4.265626122542306e-02, 4.426802842804741e-02, 4.593335936539433e-02, 4.765402350963371e-02, 4.943184769110832e-02,
5.126872354805775e-02, 5.316659282366442e-02, 5.512746484065590e-02, 5.715341374017695e-02, 5.924658623150298e-02,
6.140918640941097e-02, 6.364349310252398e-02, 6.595185823803101e-02, 6.833671466591608e-02, 7.080056062143326e-02,
7.334597653614160e-02, 7.597562485484224e-02, 7.869225776043576e-02, 8.149870143512994e-02, 8.439787179667740e-02,
8.739277614110098e-02, 9.048652052440445e-02, 9.368229400251835e-02, 9.698338259579789e-02, 1.003931731738744e-01,
1.039151601553213e-01, 1.075529299247917e-01, 1.113101721273651e-01, 1.151906862423360e-01, 1.191983871317649e-01,
1.233372898347266e-01, 1.276115170379167e-01, 1.320253087906072e-01, 1.365830262538971e-01, 1.412891370418868e-01,
1.461482173909104e-01, 1.511649654280279e-01, 1.563442022251273e-01, 1.616908577721921e-01, 1.672099659551390e-01,
1.729066816736924e-01, 1.787862779608226e-01, 1.848541324986933e-01, 1.911157129897795e-01, 1.975765981791920e-01,
2.042424693223850e-01, 2.111190968366426e-01, 2.182123129823302e-01, 2.255280364093208e-01, 2.330722555939911e-01,
2.408510146807283e-01, 2.488703695479055e-01, 2.571364147677684e-01, 2.656552557248533e-01, 2.744329918358574e-01,
2.834756510338721e-01, 2.927892168723024e-01, 3.023795848158047e-01, 3.122525396990814e-01, 3.224136624350130e-01,
3.328683532505147e-01, 3.436217660758924e-01, 3.546787751835536e-01, 3.660438466342482e-01, 3.777210511874600e-01,
3.897139689097260e-01, 4.020256374046105e-01, 4.146583795759362e-01, 4.276137967068266e-01, 4.408926345108801e-01,
4.544946994027770e-01, 4.684186411623745e-01, 4.826619073829413e-01, 4.972205662043195e-01, 5.120891723276200e-01,
5.272605134477985e-01, 5.427255018050126e-01, 5.584729557729631e-01, 5.744894096219003e-01, 5.907588309168045e-01,
6.072624513895929e-01, 6.239785205673239e-01, 6.408820784452591e-01, 6.579446823895981e-01, 6.751342192611512e-01,
6.924146905220633e-01, 7.097460264751286e-01, 7.270839321656986e-01, 7.443798099419868e-01, 7.615807315005799e-01,
7.786294876113157e-01, 7.954647879517510e-01, 8.120215670459850e-01, 8.282314411609915e-01, 8.440233375507971e-01,
8.593244148619452e-01, 8.740611430503441e-01, 8.881606055288961e-01, 9.015520524341847e-01, 9.141687939214495e-01,
9.259502059677074e-01, 9.368438193470870e-01, 9.468075155760183e-01, 9.558117659236037e-01, 9.638415906288208e-01,
9.708980284014210e-01, 9.769991556010101e-01, 9.821804314564566e-01, 9.864941142754962e-01, 9.900075473722704e-01,
9.928006021230259e-01, 9.949622418923878e-01, 9.965864066620354e-01, 9.977674409043544e-01, 9.985957390441925e-01,
9.991538996011060e-01, 9.995138052403904e-01, 9.997348433078885e-01, 9.998634876235176e-01, 9.999340435105117e-01,
9.999702897374164e-01, 9.999876125809349e-01, 9.999952573365697e-01, 9.999983471293699e-01, 9.999994807411241e-01,
9.999998545219742e-01, 9.999999640793696e-01, 9.999999922833868e-01, 9.999999985785893e-01, 9.999999997790499e-01,
9.999999999715266e-01, 9.999999999970198e-01, 9.999999999997560e-01, 9.999999999999872e-01, 1.000000000000000e+00,
  };
  /* ksiTable supplies the values and FTable the abscissae for the lookup at RN.
   * NOTE(review): the literal arguments 0 and 2 are presumably the warning
   * flag and interpolation order of interp() -- confirm against its prototype.
   */
  value = interp(ksiTable, FTable, 200, RN, 0, 2, &interpCode);
  /* fall back to the lowest tabulated energy if interp() returned a zero code */
  if (!interpCode)
    return ksiTable[0];
  return value;
}

/* Apply synchrotron radiation energy loss (and optionally quantum
 * excitation) for a steering corrector to every particle.
 *
 * coord        particle array; coord[i][5] (momentum deviation) and
 *              coord[i][4] are modified
 * np           number of particles
 * elem         the corrector element; elem->p_elem is cast according to type
 * type         one of T_HCOR, T_VCOR, T_HVCOR, T_EHCOR, T_EVCOR, T_EHVCOR;
 *              for any other value nothing is done
 * Po           central momentum used to form each particle's momentum
 * sigmaDelta2  if non-NULL, accumulates the ISR variance term for every
 *              particle and is divided by np at the end
 * disableISR   if nonzero, the random ISR kick is suppressed
 *
 * Nothing is done if the element's synchRad flag is off or its radiation
 * length is zero.
 */
void addCorrectorRadiationKick(double **coord, long np, ELEMENT_LIST *elem, long type, double Po, double *sigmaDelta2, long disableISR)
{
  double F2;
  double kick, length;
  double isrCoef, radCoef, dp, p, beta0, beta1, deltaFactor;
  short isr, sr;
  long i;

#ifdef HAVE_GPU
  /* hand the work to the GPU implementation when the element lives there */
  if(getElementOnGpu()){
    startGpuTimer();
    gpu_addCorrectorRadiationKick(np, elem, type, Po, sigmaDelta2, disableISR);
#ifdef GPU_VERIFY     
    /* verification build: also run the CPU version and compare.
     * NOTE(review): this re-enters this function; presumably startCpuTimer()
     * makes getElementOnGpu() return false -- confirm.
     */
    startCpuTimer();
    addCorrectorRadiationKick(coord, np, elem, type, Po, sigmaDelta2, disableISR);
    compareGpuCpu(np, "addCorrectorRadiationKick");
#endif /* GPU_VERIFY */
    return;
  }
#endif /* HAVE_GPU */

  if (!np)
    return;

  isr = sr = 0;

  /* For each corrector type: get the kick (combined in quadrature for the
   * two-plane types), use lEffRad as the length if the element length is
   * zero, and pick up the synchRad/isr flags (isr only matters if synchRad is set).
   */
  switch (type) {
  case T_HCOR:
    kick = ((HCOR*)elem->p_elem)->kick;
    if ((length = ((HCOR*)elem->p_elem)->length)==0) 
      length = ((HCOR*)elem->p_elem)->lEffRad;
    if (((HCOR*)elem->p_elem)->synchRad) {
      sr = 1;
      if (((HCOR*)elem->p_elem)->isr) 
	isr = 1;
    }
    break;
  case T_VCOR:
    kick = ((VCOR*)elem->p_elem)->kick;
    if ((length = ((VCOR*)elem->p_elem)->length)==0) 
      length = ((VCOR*)elem->p_elem)->lEffRad;
    if (((VCOR*)elem->p_elem)->synchRad) {
      sr = 1;
      if (((VCOR*)elem->p_elem)->isr) 
	isr = 1;
    }
    break;
  case T_HVCOR:
    kick = sqrt(sqr(((HVCOR*)elem->p_elem)->xkick)+sqr(((HVCOR*)elem->p_elem)->ykick));
    if ((length = ((HVCOR*)elem->p_elem)->length)==0) 
      length = ((HVCOR*)elem->p_elem)->lEffRad;
    if (((HVCOR*)elem->p_elem)->synchRad) {
      sr = 1;
      if (((HVCOR*)elem->p_elem)->isr) 
	isr = 1;
    }
    break;
  case T_EHCOR:
    kick = ((EHCOR*)elem->p_elem)->kick;
    if ((length = ((EHCOR*)elem->p_elem)->length)==0) 
      length = ((EHCOR*)elem->p_elem)->lEffRad;
    if (((EHCOR*)elem->p_elem)->synchRad) {
      sr = 1;
      if (((EHCOR*)elem->p_elem)->isr) 
	isr = 1;
    }
    break;
  case T_EVCOR:
    kick = ((EVCOR*)elem->p_elem)->kick;
    if ((length = ((EVCOR*)elem->p_elem)->length)==0) 
      length = ((EVCOR*)elem->p_elem)->lEffRad;
    if (((EVCOR*)elem->p_elem)->synchRad) {
      sr = 1;
      if (((EVCOR*)elem->p_elem)->isr) 
	isr = 1;
    }
    break;
  case T_EHVCOR:
    kick = sqrt(sqr(((EHVCOR*)elem->p_elem)->xkick)+sqr(((EHVCOR*)elem->p_elem)->ykick));
    if ((length = ((EHVCOR*)elem->p_elem)->length)==0) 
      length = ((EHVCOR*)elem->p_elem)->lEffRad;
    if (((EHVCOR*)elem->p_elem)->synchRad) {
      sr = 1;
      if (((EHVCOR*)elem->p_elem)->isr) 
	isr = 1;
    }
    break;
  }
  /* sr is tested first, so kick/length are never read when no case matched */
  if (sr==0 || length==0) 
    return ;
  if (disableISR)
    isr = 0;
  radCoef = sqr(particleCharge)*pow3(Po)/(6*PI*epsilon_o*sqr(c_mks)*particleMass);
  isrCoef = particleRadius*sqrt(55.0/(24*sqrt(3))*pow5(Po)*137.0359895);

  /* squared kick per unit length, used as the field-strength term in the loss formulas */
  F2 = sqr(kick/length);
  for (i=0; i<np; i++) {
    dp = coord[i][5];
    p = Po*(1+dp);
    beta0 = p/sqrt(sqr(p)+1);
    deltaFactor = sqr(1+dp);
    /* mean energy loss */
    dp -= radCoef*deltaFactor*F2*length;
    /* random part */
    if (isr)
      dp += isrCoef*deltaFactor*pow(F2, 0.75)*sqrt(length)*gauss_rn_lim(0.0, 1.0, srGaussianLimit, random_2);
    /* accumulated regardless of the isr flag */
    if (sigmaDelta2)
      *sigmaDelta2 += sqr(isrCoef*deltaFactor)*pow(F2, 1.5)*length;
    p = Po*(1+dp);
    beta1 = p/sqrt(sqr(p)+1);
    coord[i][5] = dp;
    /* rescale coordinate 4 by the ratio of new to old beta */
    coord[i][4] = beta1*coord[i][4]/beta0;
  }
  /* NOTE(review): this divides whatever *sigmaDelta2 held on entry as well as
   * the terms added above -- confirm callers pass in zero.
   */
  if (sigmaDelta2)
    *sigmaDelta2 /= np;
}
  
/* Compute the one-sided lagged product sum of two n-point arrays:
 *
 *   output[lag] = sum over k=0..n-1-lag of a1[lag+k]*a2[k]
 *
 * output must have room for n values.  Results are accumulated directly
 * in output.
 */
void convolveArrays1(double *output, long n, double *a1, double *a2)
{
  long lag, k;

  for (lag=0; lag<n; lag++) {
    output[lag] = 0;
    /* a1 is read starting at offset lag, a2 from its beginning */
    for (k=0; k<n-lag; k++)
      output[lag] += a1[lag+k]*a2[k];
  }
}

/* Prepare photon logging for a CSBEND element.
 *
 * csbend    element whose photonOutputFile, photonLowEnergyCutoff,
 *           photonFileActive and SDDSphotons members are used/updated
 * rootname  passed to compose_filename() to build the output filename
 * np        number of particles, stored in the "Particles" parameter
 *
 * On the first call for an element with an output file, the SDDS file is
 * created and its layout written.  On every call a new page is started and
 * the page parameters are set.  The file-scope SDDSphotons pointer is left
 * pointing at the element's dataset, or NULL if no output is requested.
 * In MPI builds photon output is always disabled.
 */
void setUpCsbendPhotonOutputFile(CSBEND *csbend, char *rootname, long np)
{
  TRACKING_CONTEXT tc;
#if USE_MPI
  /* no photon output in MPI builds; the rest of the function is not reached */
  SDDSphotons = NULL;
  return;
#endif
  if (!csbend->photonOutputFile) {
    SDDSphotons = NULL;
    return;
  }
  photonLowEnergyCutoff = csbend->photonLowEnergyCutoff;
  getTrackingContext(&tc);
  if (!csbend->photonFileActive) {
    /* first use: create the file and define its layout */
    csbend->photonOutputFile = compose_filename(csbend->photonOutputFile, rootname);
    csbend->SDDSphotons = tmalloc(sizeof(SDDS_DATASET));
    /* The columns must stay in the order Ep, x, xp, y, yp: logPhoton()
     * writes them by index 0..4.
     */
    if (!SDDS_InitializeOutputElegant(csbend->SDDSphotons, SDDS_BINARY, 1, NULL, NULL, csbend->photonOutputFile) ||
        0>SDDS_DefineParameter(csbend->SDDSphotons, "Step", NULL, NULL, NULL, NULL, SDDS_LONG, NULL) ||
        0>SDDS_DefineParameter(csbend->SDDSphotons, "SVNVersion", NULL, NULL, "SVN version number", NULL, SDDS_STRING, SVN_VERSION) ||
        0>SDDS_DefineParameter(csbend->SDDSphotons, "Particles", NULL, NULL, "Number of charged particles", NULL, SDDS_LONG, NULL) ||
        0>SDDS_DefineParameter(csbend->SDDSphotons, "LowEnergyCutoff", NULL, "eV", "Minimum photon energy included in output", NULL, SDDS_DOUBLE, NULL) ||
        0>SDDS_DefineParameter(csbend->SDDSphotons, "ElementName", NULL, NULL, NULL, NULL, SDDS_STRING, tc.elementName) ||
        0>SDDS_DefineParameter(csbend->SDDSphotons, "ElementOccurence", NULL, NULL, NULL, NULL, SDDS_LONG, NULL) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSphotons, "Ep", "eV", SDDS_FLOAT) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSphotons, "x", "m", SDDS_FLOAT) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSphotons, "xp", "", SDDS_FLOAT) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSphotons, "y", "m", SDDS_FLOAT) ||
        !SDDS_DefineSimpleColumn(csbend->SDDSphotons, "yp", "", SDDS_FLOAT) ||
        !SDDS_WriteLayout(csbend->SDDSphotons)) {
      SDDS_SetError("Problem setting up photon output file for CSBEND");
      SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
    }
    csbend->photonFileActive = 1;
  }
  /* start a new page; 10000 rows matches the flush interval used in logPhoton() */
  if (!SDDS_StartPage(csbend->SDDSphotons, 10000) || 
      !SDDS_SetParameters(csbend->SDDSphotons, SDDS_SET_BY_NAME|SDDS_PASS_BY_VALUE, "Particles", np, "Step", tc.step,
                          "LowEnergyCutoff", photonLowEnergyCutoff, "ElementName", tc.elementName, "ElementOccurence", tc.elementOccurrence, NULL)) {
    SDDS_SetError("Problem setting up photon output file for CSBEND");
    SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
  }
  /* reset the row counter used by logPhoton() and make this dataset the active one */
  photonRows = 0;
  SDDSphotons = csbend->SDDSphotons;
}

/* Record one emitted photon in the active photon output file.
 *
 * Ep        photon energy in units that become eV after multiplying by me_mev*1e6
 * x, xp     horizontal position and slope at emission
 * y, yp     vertical position and slope at emission
 * thetar    angle at which the emission takes place
 * thetaf    angle of the exit plane
 * rho       bending radius
 *
 * Photons below photonLowEnergyCutoff are ignored.  Otherwise the photon is
 * projected along a straight line to the exit plane and one row
 * (Ep, x, xp, y, yp at the exit plane) is written.  The page is flushed
 * every 10000 rows.
 */
void logPhoton(double Ep, double x, double xp, double y, double yp, double thetar, double thetaf, double rho)
{
  double emitAngleX, emitAngleY;   /* photon direction at emission */
  double emitX, emitZ;             /* emission point */
  double cosRef, cosExit;
  double driftLength, exitRadius;

  Ep *= me_mev*1e6;
  if (Ep<photonLowEnergyCutoff)
    return;

  /* emission */
  emitAngleX = thetar - atan(xp);
  emitAngleY = atan(yp);
  cosRef = cos(thetar);
  emitX = -rho*(1-cosRef) + x*cosRef;
  emitZ = (x+rho)*sin(thetar);

  /* intersection with exit plane */
  cosExit = cos(thetaf-emitAngleX);
  driftLength = (emitZ*cos(thetaf)-(rho+emitX)*sin(thetaf))/cosExit;
  exitRadius = ((rho+emitX)*cos(emitAngleX)+emitZ*sin(emitAngleX))/cosExit;

  /* columns are addressed by index: 0=Ep, 1=x, 2=xp, 3=y, 4=yp */
  if (!SDDS_SetRowValues(SDDSphotons, SDDS_SET_BY_INDEX|SDDS_PASS_BY_VALUE, photonRows++,
                         0, (float)Ep,
                         1, (float)(exitRadius - rho),
                         2, (float)(-tan(emitAngleX - thetaf)),
                         3, (float)(y + driftLength*tan(emitAngleY)),
                         4, (float)tan(emitAngleY),
                         -1))
    SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);

  /* push completed rows to disk periodically */
  if (photonRows%10000==0 && !SDDS_UpdatePage(SDDSphotons, FLUSH_TABLE))
    SDDS_PrintErrors(stderr, SDDS_EXIT_PrintErrors|SDDS_VERBOSE_PrintErrors);
}

/* State shared between csbend_update_fse_adjustment() and its penalty function */
/* working copy of the element being optimized */
static CSBEND csbendWorking;
/* number of penalty-function evaluations in the current optimization */
static long optimizationEvaluations;
/* single-particle buffer tracked by the penalty function; allocated on first use */
static double **optParticle = NULL;

/* Penalty function for the FSE-correction optimization.
 *
 * value    points to the trial FSE correction value
 * invalid  set to 1 if tracking fails, 0 otherwise
 *
 * A single particle starting with all coordinates zero is tracked through
 * the working copy of the bend using the trial correction.  Returns the
 * magnitude of coordinate 1 of the tracked particle, or 0 if tracking fails.
 */
double csbend_fse_adjustment_penalty(double *value, long *invalid) 
{
  /* make sure the one-particle buffer exists, then zero it */
  if (optParticle==NULL)
    optParticle = (double**)czarray_2d(sizeof(**optParticle), 1, totalPropertiesPerParticle);
  memset(optParticle[0], 0, totalPropertiesPerParticle*sizeof(**optParticle));

  optimizationEvaluations += 1;
  csbendWorking.fseCorrectionValue = *value;

  if (track_through_csbend(optParticle, 1, &csbendWorking, 0, 1e3, NULL, 0.0, NULL, NULL, NULL, NULL, NULL, -1)) {
    *invalid = 0;
    return fabs(optParticle[0][1]);
  }

  /* particle was lost: this trial value is unusable */
  *invalid = 1;
  return 0.0;
}

/* number of FSE optimizations performed so far; used to throttle messages */
static long FSEOptimizationCount = 0;

/* Determine the FSE correction that centers the trajectory through a CSBEND.
 *
 * Nothing is done unless csbend->fseCorrection is set and at least one of
 * the two edges uses edge-effects mode 2 or 4.
 *
 * A working copy of the element is made with the user's FSE, the
 * misalignments (dx, dy, dz), tilts, radiation flags, and path-length
 * correction all zeroed.  A one-dimensional simplex search then varies the
 * correction between -1 and 1 to minimize csbend_fse_adjustment_penalty().
 * On success csbend->fseCorrectionValue and csbend->fseCorrectionPathError
 * are updated; on failure the program is terminated.
 *
 * A message is printed for each of the first 1000 optimizations; after
 * that only a running count is printed every 1000 optimizations.
 */
void csbend_update_fse_adjustment(CSBEND *csbend)
{
  double fseUser = 0, fse = 0, stepSize = 1e-3, lowerLimit = -1, upperLimit = 1, acc;
  short disable = 0;
  long edge1, edge2;

  if (!csbend->fseCorrection)
    return;
  edge1 = csbend->edge_effects[csbend->e1Index];
  edge2 = csbend->edge_effects[csbend->e2Index];
  if (edge1!=2 && edge2!=2 && edge1!=4 && edge2!=4)
    return;

  if (!optParticle) 
    optParticle = (double**)czarray_2d(sizeof(**optParticle), 1, totalPropertiesPerParticle);

  /* take the user's FSE out while copying so the working element starts from zero */
  fseUser = csbend->fse;
  csbend->fse = 0;
  memcpy(&csbendWorking, csbend, sizeof(csbendWorking));
  csbendWorking.dx = csbendWorking.dy = csbendWorking.dz = csbendWorking.etilt = csbendWorking.tilt = 0;
  csbendWorking.isr = csbendWorking.synch_rad = csbendWorking.fseCorrectionPathError = 0;
  optimizationEvaluations = 0;
  if (simplexMin(&acc, &fse, &stepSize, &lowerLimit, &upperLimit, &disable, 1,
                 fabs(1e-14*csbend->angle), fabs(1e-16*csbend->angle), 
                 csbend_fse_adjustment_penalty, NULL, 1500, 3, 12, 3.0, 1.0, 0)<0) {
    bombElegantVA("failed to find FSE to center trajectory for csbend. accuracy achieved was %le.", acc);
  }
  csbend->fse = fseUser;
  csbend->fseCorrectionValue = fse;
  /* NOTE(review): optParticle holds the result of the last penalty evaluation,
   * which is presumably (but not verifiably from here) at the returned optimum.
   */
  csbend->fseCorrectionPathError = optParticle[0][4] - csbend->length;

  FSEOptimizationCount++;
  if (FSEOptimizationCount<=1000) {
    printf("FSE optimized to %le (%le net) for CSBEND after %ld evaluations, giving error of %le and path-length %s of %le\n",
           fse, fse+fseUser, optimizationEvaluations, acc, 
           csbend->fseCorrection==1?"adjustment":"error", csbend->fseCorrectionPathError);
    if (FSEOptimizationCount==1000)
      printf("Suppressing further FSE optimization messages\n");
    fflush(stdout);
  } else if (FSEOptimizationCount%1000==0) {
    printf("FSE optimization done %ld times in total\n", FSEOptimizationCount);
    fflush(stdout);
  }
}

/* Apply a thin dipole-edge kick to one particle.
 *
 * xp, yp     particle slopes, updated in place
 * x, y       particle position
 * delta      fractional momentum deviation
 * rho        bending radius
 * ea         edge angle
 * psi        correction subtracted (after division by 1+delta) from the
 *            edge angle for the vertical kick
 * kickLimit  if positive, the magnitude of the horizontal kick is capped at this value
 * expanded   passed through to the coordinate conversion routines
 *
 * The kicks are applied in the dipole canonical coordinates so that the
 * transformation is symplectic; it is based on the linear K. L. Brown matrix.
 */
void applySimpleDipoleEdgeKick(double *xp, double *yp, double x, double y, double delta, double rho, double ea, double psi, 
                               double kickLimit, long expanded) 
{
  double coord[6];
  double kickX, kickY;

  coord[0] = x;
  coord[1] = *xp;
  coord[2] = y;
  coord[3] = *yp;
  coord[4] = 0;
  coord[5] = delta;
  convertToDipoleCanonicalCoordinates(coord, expanded);

  /* vertical kick, with the momentum-dependent correction to the edge angle */
  kickY = -tan(ea-psi/(1+delta))/rho*y;

  /* horizontal kick, optionally limited in magnitude */
  kickX = tan(ea)/rho*x;
  if (kickLimit>0 && fabs(kickX)>kickLimit)
    kickX = SIGN(kickX)*kickLimit;

  coord[1] += kickX;
  coord[3] += kickY;

  convertFromDipoleCanonicalCoordinates(coord, expanded);
  *xp = coord[1];
  *yp = coord[3];
}
