manuals/libatomprobe/processing_8cpp_source.html

 /* processing.cpp :  Processing mass spectra - statistical and numerical functions
  * Copyright (C) 2017  Daniel Haley
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 #include <vector>
 #include <algorithm>
 #include <cmath>
 #include <limits>
 #include <string>
 #include <random>

 #include <gsl/gsl_sf_erf.h>
 #include <gsl/gsl_histogram.h>

 #include "atomprobe/spectrum/processing.h"
 #include "atomprobe/helper/misc.h"
 #include "atomprobe/helper/maths/misc.h"

 #include "atomprobe/helper/aptAssert.h"

 using std::string;
 using std::vector;

 namespace AtomProbe
 {


 //Anderson-Darling test statistic for Gaussian-ness of a sample.
 //Implemented for the case where mean & variance are unknown (derived from data),
 // although the caller may instead supply them by passing computeMeanAndStdev=false.
 //
 // Parameters:
 //  vals       - sample values. Taken by value, as they are standardised and sorted locally
 //  meanV      - mean used to standardise the data; overwritten by meanAndStdev()
 //               when computeMeanAndStdev is true
 //  stdevVal   - standard deviation used to standardise the data; overwritten as for meanV
 //  statistic  - output: the corrected A^2 value, or NaN if no term of the sum could be evaluated
 //  undefCount - output: number of terms of the A^2 sum that could not be evaluated
 //  computeMeanAndStdev - if true, estimate meanV and stdevVal from vals
 //
 // Returns false if input has insufficient points for the test (fewer than 2 items); in that
 //  case none of the outputs are written. Returns true otherwise. When true is returned and
 //  undefCount equals the input size, the statistic is NaN and should not be used.
 //
 // Reject the hypothesis of normality, at the given significance level, if the
 // statistic exceeds:
 // 15% - 0.576
 // 10% - 0.656
 //  5% - 0.787
 //2.5% - 0.918
 //  1% - 1.092
 //See, eg
 // http://itl.nist.gov/div898/handbook/eda/section3/eda35e.htm
 template<class T>
 bool andersonDarlingStatistic(std::vector<T> vals, float &meanV, float &stdevVal,
         float &statistic, size_t &undefCount, bool computeMeanAndStdev=true)
 {
     const size_t nVals=vals.size();
     //we cannot compute this without more data
     if(nVals <= 1)
         return false;

     if(computeMeanAndStdev)
         meanAndStdev(vals,meanV,stdevVal);

     //A zero (e.g. all values identical) or non-finite standard deviation cannot be used
     // to standardise the data: the division below would fill vals with NaN/inf, and
     // sorting NaN values breaks the ordering requirement of std::sort.
     // Report every term as undefined instead, which is what such input amounts to.
     if(stdevVal == 0.0f || !std::isfinite(stdevVal))
     {
         undefCount=nVals;
         statistic=std::numeric_limits<float>::quiet_NaN();
         return true;
     }

     //Bring assumed gauss data into a normal dist
     for(size_t ui=0;ui<nVals;ui++)
         vals[ui]=(vals[ui]-meanV)/stdevVal;

     //For test, data *must be sorted*
     std::sort(vals.begin(),vals.end());

     //Compute the Phi distribution (standard normal CDF) from the error function
     // - also compute log of this for later use
     //--
     std::vector<double> normedPhi(nVals),logCdf(nVals,0.0);
     std::vector<bool> normedPhiOK(nVals,true);

     for(size_t ui=0;ui<nVals; ui++)
     {
         normedPhi[ui] = 0.5*(1.0+gsl_sf_erf(vals[ui]/sqrt(2.0)));

         //Too close to zero for the logarithm to be trusted; flag the sample
         if(normedPhi[ui] < std::numeric_limits<float>::epsilon())
             normedPhiOK[ui]=false;
         else
             logCdf[ui] = log(normedPhi[ui]);
     }
     //--

     //Compute anderson-darling statistic
     // Term i combines log(Phi(Y_i)) with log(1-Phi(Y_(n-1-i))) (zero-based indices).
     // It is undefined if *either* logarithm cannot be evaluated, so both are checked
     // here against the sample they actually belong to.
     //--
     undefCount=0;
     double sumV=0.0;
     for(size_t i=0;i<nVals; i++)
     {
         const double upperTail=1.0-normedPhi[nVals-(i+1)];
         if(normedPhiOK[i] && upperTail > 0.0)
             sumV+=(2.0*(i+1.0)-1.0)*(logCdf[i] + log(upperTail));
         else
             undefCount++;
     }

     //Only the terms that could be evaluated contribute to the sample count
     const size_t nDefined=nVals-undefCount;
     if(!nDefined)
     {
         //Nothing could be evaluated; avoid the 0/0 division below
         statistic=std::numeric_limits<float>::quiet_NaN();
         return true;
     }

     const double nD=(double)nDefined;

     //Compute A^2
     statistic=-nD - sumV/nD;

     //Perform correction of Shorack & Wellner (mean, variance unknown)
     //pp239, "Empirical Processes with Applications to Statistics"
     //doi.org/10.1137/1.9780898719017
     //Table 1, part A
     statistic*=(1.0 + 4.0/nD - 25/(nD*nD));

     //--

     return true;
 }


 //Bin the values in data into a uniform histogram spanning start to end.
 //
 // Parameters:
 //  data     - values to bin
 //  start    - lower edge of the histogram range
 //  end      - upper edge of the histogram range; must be greater than start
 //  step     - requested bin width. The bin count is (end-start)/step truncated
 //             towards zero, so the actual bin width is (end-start)/binCount
 //  histVals - output: per-bin counts. Emptied if no histogram can be built
 //             (fewer than one whole bin, or allocation failure)
 //
 // Per the GSL documentation, values outside the histogram range are not counted.
 void makeHistogram(const vector<float> &data, float start,
             float end, float step, vector<float> &histVals)
 {
     ASSERT(start < end);
     ASSERT(step > std::numeric_limits<float>::epsilon());

     //Validate the bin count while it is still a float. Converting a negative,
     // NaN or infinite value to an integer is undefined, and gsl_histogram_alloc
     // raises a GSL error when asked for zero bins.
     const float binCountF = (end-start)/step;
     if(!(binCountF >= 1.0f) || !std::isfinite(binCountF))
     {
         histVals.clear();
         return;
     }

     const size_t binCount = binCountF;

     gsl_histogram *h = gsl_histogram_alloc(binCount);
     if(!h)
     {
         //Allocation failed (only reachable if the GSL error handler returns)
         histVals.clear();
         return;
     }

     gsl_histogram_set_ranges_uniform(h,start,end);

     for(size_t ui=0; ui<data.size();ui++)
         gsl_histogram_increment(h,data[ui]);

     //Copy out data; GSL stores bin contents as double
     histVals.resize(h->n);
     for(size_t ui=0;ui<h->n; ui++)
         histVals[ui]=h->bin[ui];

     gsl_histogram_free(h);
 }

 //Convert an error code, as returned by doFitBackground, into a human-readable message.
 //
 // Parameters:
 //  errMsg - error code; expected to be less than BACKGROUND_PARAMS::FIT_FAIL_END
 //
 // Returns the matching message. Code 0 maps to the empty string. A code outside the
 //  table yields a generic message rather than reading past the end of the table,
 //  which is what would otherwise happen when ASSERT is compiled out.
 string getFitErrorMsg(unsigned int errMsg)
 {
     ASSERT(errMsg < BACKGROUND_PARAMS::FIT_FAIL_END);

     //Indexed by error code; the table never changes, so build it only once
     static const char * const errorMsgs[BACKGROUND_PARAMS::FIT_FAIL_END] = {
         "",
         "Insufficient bins to perform fit (in TOF-space)",
         "Insufficient counts to perform fit (in TOF-space)",
         "Insufficient data to perform fit",
         "Data did not appear to be random noise - cannot fit noise level"
     };

     if(errMsg >= BACKGROUND_PARAMS::FIT_FAIL_END)
         return std::string("Unknown background fit error");

     return std::string(errorMsgs[errMsg]);
 }

 //Fit a constant background level to the masses lying between backParams.massStart
 // and backParams.massEnd. The fit is performed on sqrt(mass) ("TOF-space"): the
 // data are histogrammed there and the bin counts tested for Gaussian-ness.
 //
 // Parameters:
 //  masses     - input mass values
 //  backParams - in: massStart, massEnd and binWidth (bin width in TOF-space)
 //               out: stdev (written by the Anderson-Darling call) and, on success,
 //               intensity (mean counts per TOF-space bin)
 //
 // Returns 0 on success, otherwise one of the BACKGROUND_PARAMS::FIT_FAIL_* codes.
 unsigned int doFitBackground(const vector<float> &masses,
     BACKGROUND_PARAMS &backParams)
 {
     ASSERT(backParams.massStart < backParams.massEnd);

     //Minimum required bin count, and average counts per bin,
     // to have sufficient statistics
     const unsigned int MIN_REQUIRED_AVG_COUNTS=10;
     const unsigned int MIN_REQUIRED_BINS=10;

     //Keep only the masses inside the fit window, converted to TOF-space
     vector<float> tofSamples;
     for(const float massVal : masses)
     {
         if( massVal >=backParams.massStart && massVal <= backParams.massEnd)
             tofSamples.push_back(sqrtf(massVal));
     }

     //Limits of the fit window in TOF-space
     const auto tofStart = sqrt(backParams.massStart);
     const auto tofEnd = sqrt(backParams.massEnd);
     const auto tofSpan = tofEnd - tofStart;

     //Whole number of bins that fit the window, and the step that spans it exactly
     size_t nBinsTof = tofSpan / backParams.binWidth;
     float filterStep = tofSpan / nBinsTof;

     //we cannot perform a test with fewer than this number of bins
     if ( nBinsTof < MIN_REQUIRED_BINS)
         return BACKGROUND_PARAMS::FIT_FAIL_MIN_REQ_BINS;

     //The reasoning for this test is not well-grounded.
     // It arises as we are approximating a poisson as a gaussian.
     // If the average count is too small, our gaussian becomes poisson shaped
     // and the anderson test is not valid to apply.
     // However we have not referenced/checked how small "too-small" really is.
     // Some testing was done, but needs to be revisited with more rigour
     float averageCounts = tofSamples.size()/ (float)nBinsTof;
     if( averageCounts < MIN_REQUIRED_AVG_COUNTS)
         return BACKGROUND_PARAMS::FIT_FAIL_AVG_COUNTS;

     //Histogram the data in TOF-space, then check the bin counts are gaussian
     vector<float> histogram;
     makeHistogram(tofSamples,tofStart,tofEnd, filterStep,histogram);

     float andersonStat,meanVal;
     size_t undefCount;
     //TODO: Error message regarding fit failure
     const bool testRan = andersonDarlingStatistic(histogram,meanVal,
                     backParams.stdev,andersonStat, undefCount);
     if(!testRan)
         return BACKGROUND_PARAMS::FIT_FAIL_INSUFF_DATA;

     //Rejection threshold for Anderson statistic
     // - either we didn't have enough samples,
     // - or we failed the null hypothesis test of Gaussian-ness
     // Rejection of null hypothesis at 99% confidence occurs at 1.092 [NIST].
     // we use much more than this, in case batch processing/familywise
     // error is present (i.e. we are calling this function a lot)
     // two slightly overlapping Gaussians can trigger at the 1.8 level
     const float STATISTIC_THRESHOLD=2.0;
     const bool looksGaussian = !(andersonStat > STATISTIC_THRESHOLD)
                     && undefCount != histogram.size();
     if(!looksGaussian)
         return BACKGROUND_PARAMS::FIT_FAIL_DATA_NON_GAUSSIAN;

     //Intensity PER BIN in TOF space (counts/time)
     backParams.intensity= meanVal;

     return 0;
 }

 //Build the expected background histogram in mass space.
 //
 // Parameters:
 //  massStart, massEnd - limits of the mass range covered by the histogram
 //  nBinsMass          - number of equal-width bins to divide that range into
 //  tofBackIntensity   - intensity level per unit time in the background,
 //                       as obtained by doFitBackground
 //  histogram          - output: resized to nBinsMass. Each bin receives
 //                       tofBackIntensity*(sqrt(upper edge) - sqrt(lower edge));
 //                       bins whose lower edge is at or below zero mass are set to 0
 void createMassBackground(float massStart, float massEnd, unsigned int nBinsMass,
             float tofBackIntensity, vector<float> &histogram)
 {
     const float binWidth = (massEnd-massStart)/nBinsMass;

     //compute fitted value analytically, one bin at a time
     histogram.resize(nBinsMass);
     size_t binIdx=0;
     for(float &binValue : histogram)
     {
         const float lowerEdge=(float)(binIdx)*binWidth+ massStart;
         binIdx++;

         //The square root is not usable at or below zero mass
         if ( lowerEdge <=0)
         {
             binValue=0;
             continue;
         }

         const float upperEdge=lowerEdge+binWidth;

         //Counts in the bin: flat intensity in sqrt(mass), so the bin's extent
         // in sqrt(mass) times the intensity
         binValue = tofBackIntensity*(sqrt(upperEdge) - sqrt(lowerEdge));
     }
 }

 //Compute a finite-difference derivative of a uniformly sampled signal.
 //
 // Parameters:
 //  in  - input samples
 //  out - output: resized to in.size()-1 entries. With USE_CENTRAL defined,
 //        out[i-1] holds the 2nd order central difference about in[i] for the
 //        interior samples, and the final entry holds the first-order difference
 //        of the last two samples. Emptied if in has fewer than two samples.
 void diff(const vector<float>  & in, vector<float>& out)
 {
     //No difference can be formed from fewer than two samples. Without this guard
     // the unsigned in.size()-1 wraps around for empty input, and the last-element
     // write below lands outside both vectors for single-element input.
     if(in.size() < 2)
     {
         out.clear();
         return;
     }

     const size_t nIn=in.size();
 #define USE_CENTRAL
 #ifdef USE_CENTRAL
     //2nd order central
     out.resize(nIn-1);
     for(size_t i=1; i<nIn-1; ++i)
         out[i-1] = 0.5*(in[i+1] - in[i-1]);

     //First-order, for the final entry where no right-hand neighbour exists
     out[out.size()-1] = in[nIn-1] - in[nIn-2];
 #else
     //This computes a first upwind differential (x'_i = x_(i+1)-x_i)
     out.resize(nIn-1);
     for(size_t i=1; i<nIn; ++i)
         out[i-1] = in[i] - in[i-1];
 #endif
 }

 //Simple peak-finding on a sampled signal. Candidate extrema are located where the
 // derivative of x0 changes sign; the candidates are then walked alternately
 // (peak, valley, peak, ...) and a peak is accepted once the signal has risen at
 // least sel above the valley to its left and dropped at least sel below the peak.
 //
 // Parameters:
 //  x0               - input signal samples
 //  peakInds         - output: on normal completion, resized and filled with the
 //                     indices into x0 of the accepted peaks. Left untouched on
 //                     the two early-return paths below
 //  sel              - selectivity (minimum rise/fall for a peak to count);
 //                     replaced locally when autoSel is set
 //  autoSel          - if true, sel is set to a quarter of (max - min) of x0
 //  includeEndpoints - if true, the first and last samples of x0 are added as
 //                     candidate extrema
 void findPeaks(const vector<float> &x0, vector<unsigned int>& peakInds, float sel,
         bool autoSel,bool includeEndpoints)
 {
     //A derivative needs at least two samples
     if(x0.size() < 2)
         return;

     //Heuristically select the cutoff threshold
     if(autoSel)
     {
         auto p = std::minmax_element(x0.begin(),x0.end());

         size_t minIdx=std::distance(x0.begin(),p.first);
         size_t maxIdx=std::distance(x0.begin(),p.second);

         sel = (x0[maxIdx]-x0[minIdx])/4.0;
     }

     //Compute derivative
     vector<float> dx0;
     diff(x0,dx0);

     //Adjust values so repeated values are not 0 derivative
     // (a flat segment is given a tiny positive slope, so it has a definite sign)
     for(auto &f : dx0)
     {
         if(f  ==0)
             f = std::numeric_limits<float>::epsilon();
     }

     //indices of potential peaks
     vector<unsigned int> ind;

     //Push padding value if needed
     if(includeEndpoints)
         ind.push_back(0);

     //Find the position to the right,
     // where the derivative changes sign
     for(auto ui=0u;ui<dx0.size()-1;ui++)
     {
         if(std::signbit(dx0[ui]) !=std::signbit(dx0[ui+1]))
             ind.push_back(ui+1);
     }

     //leftMin : lowest valley seen to the left of the peak under consideration
     //minMag  : smallest value among the candidate extrema
     float leftMin, minMag;

     //Signal values at the candidate extrema
     vector<float> x;
     if(includeEndpoints)
     {
         //"bookend" values by extending constant value
         // NOTE(review): x is given ind.size()+2 entries, but only x[0],
         // x[1..ind.size()-1] and the final entry are assigned below; the entry at
         // index ind.size() keeps its value-initialised 0. After the push_back
         // further down, ind holds one element fewer than x. Confirm this is intended.
         x.resize(ind.size()+2);
         x[0]=x0[0];
         for(auto i=0u; i<ind.size()-1; i++)
         {
             //Note +1 due to previous push_back of padding value
             x[i+1] = x0[ind[i+1]];
         }
         x[x.size()-1]  =x0[x0.size()-1];

         leftMin=minMag = *std::min_element(x.begin(),x.end());

         //Final sample of x0 is also a candidate
         ind.push_back(x0.size()-1);

     }
     else
     {

         //Copy the signal values at the candidate positions into x
         // NOTE(review): if no sign change was found, ind is empty here; presumably
         // x is then empty too, making the two reads below invalid - TODO confirm.
         selectElements(x0,ind,x);
         minMag=*std::min_element(x.begin(),x.end());
         leftMin=std::min(x[0],x0[0]);
     }


     //Too few candidates to hold a peak between two valleys
     if(x.size() <=2)
         return;

     //Set up initial loop parameters
     float tempMag= minMag;

     if(includeEndpoints)
     {
         //Look at the slope signs across the first three candidates, and drop the
         // candidate that does not alternate (two consecutive moves in one direction)
         vector<float> xSub = {x[0],x[1],x[2]};
         vector<float> signDx;
         diff(xSub,signDx);
         for(auto &f : signDx)
             f = std::copysign(1.0f,f);


         if(signDx[0] <=0)
         {
             //Falling first, and still falling: drop the middle candidate
             if(signDx[0] == signDx[1])
             {
                 x.erase(x.begin()+1);
                 ind.erase(ind.begin()+1);
             }
         }
         else
         {
             //Rising first, and still rising: drop the first candidate
             if(signDx[0] == signDx[1])
             {
                 x.erase(x.begin());
                 ind.erase(ind.begin());
             }
         }


     }

     //ii is advanced before each use and read as x[ii-1]; start so that the
     // first candidate examined in the loop is treated as a peak
     unsigned int ii = (x[0] >=x[1]) ? 0 : 1;
     //Positions, within x/ind, of accepted peaks
     vector<unsigned int> peakLoc;
     bool foundPeak=false;

     //Position, within x/ind, of the best peak candidate so far.
     // Only assigned together with tempMag inside the loop
     unsigned int tempLoc;
     while( ii < x.size()-1)
     {
         //Step onto a (potential) peak
         ii++;

         //Reset peak finding if we had a peak and the next is bigger
         // Than the last, or the left min was small enough to reset
         if(foundPeak)
         {
             tempMag=minMag;
             foundPeak=false;
         }


         // Check for new peak larger than the temporary size
         // and selectivity larger than the minimum to its left
         if( x[ii-1] > tempMag && x[ii-1] > leftMin + sel)
         {
             tempLoc=ii-1;
             tempMag=x[ii-1];
         }

         if(ii == x.size()-1)
             break;

         //Step onto the following valley
         ii++;

         //Come down at least sel from peak
         if( !foundPeak && tempMag > sel + x[ii-1])
         {
             foundPeak=true; //we have a peak
             leftMin = x[ii-1];
             peakLoc.push_back(tempLoc);
         }
         else
             leftMin = x[ii-1];

     }

     //Handle the final candidate, which the loop above does not accept on its own
     if(includeEndpoints)
     {
         if(x[x.size()-1] > tempMag && x[x.size()-1] > leftMin + sel)
             peakLoc.push_back(x.size()-1);
         else if (!foundPeak && tempMag > minMag)
             peakLoc.push_back(tempLoc);
     }
     else if (!foundPeak)
     {
         if(x[x.size()-1] >tempMag && x[x.size()-1] > leftMin + sel)
             peakLoc.push_back(x.size()-1);
         else if( tempMag > std::min(x0[x0.size()-1],x[x.size()-1]) + sel)
             peakLoc.push_back(tempLoc);
     }

     //Map positions within the candidate list back to indices into x0
     // NOTE(review): in the includeEndpoints case a peakLoc of x.size()-1 may exceed
     // the last valid index of ind, given the size mismatch noted above - TODO confirm.
     peakInds.resize(peakLoc.size());
     for(auto i=0u; i< peakLoc.size();i++)
         peakInds[i]=ind[peakLoc[i]];

 }


 #ifdef DEBUG

 //Self-test for the background maths: simulates ions distributed uniformly in
 // TOF-space, histograms them in mass-space, and checks that the analytical
 // background from createMassBackground agrees with that histogram on average.
 //
 // Returns true; failures are reported through ASSERT.
 // NOTE(review): the generator is seeded from std::random_device, so the
 //  simulated data differ on every run.
 bool testBackgroundFitMaths()
 {
     // Seed with a real random value, if available
     std::random_device r;
     std::mt19937 gen(r());
     std::uniform_real_distribution<> dist(0.0, 1.0);

     const unsigned int NUM_IONS =100000;

     //Simulate NUM_IONS between a lower and upper limit.
     // This is flat in TOF space. The mass of each ion is its TOF squared,
     // so store the mass directly.
     //---
     const float TOF_LIMIT[2] = { 0.0,100};

     vector<float> massData(NUM_IONS);
     for(size_t ui=0;ui<NUM_IONS; ui++)
     {
         const float simTof = dist(gen)*(TOF_LIMIT[1]-TOF_LIMIT[0] ) + TOF_LIMIT[0];
         massData[ui] = simTof*simTof;
     }
     //---

     //Now build the m/c histogram numerically, and after, check that it matches
     // the anticipated (analytical) m/c histogram.
     //---
     const unsigned int NBINS_MASS= 2000;
     const float MASS_LIMIT[2] =  {TOF_LIMIT[0]*TOF_LIMIT[0], TOF_LIMIT[1]*TOF_LIMIT[1]};

     //time-space intensity per unit time
     const float TOF_MEAN_INT= NUM_IONS/(TOF_LIMIT[1] - TOF_LIMIT[0]);

     //Bin step in m/c space that yields NBINS_MASS bins
     const float MC_BIN_STEP = (MASS_LIMIT[1]-MASS_LIMIT[0])/NBINS_MASS;

     vector<float> massHist;
     makeHistogram(massData,MASS_LIMIT[0],MASS_LIMIT[1],MC_BIN_STEP,massHist);

     //compute fitted value analytically
     vector<float > fittedMassHist;
     createMassBackground(MASS_LIMIT[0],MASS_LIMIT[1],NBINS_MASS,
                     TOF_MEAN_INT,fittedMassHist);

     //Both histograms are indexed together below, so they must agree in size
     ASSERT(massHist.size() == fittedMassHist.size());

     //check that the numerical and analytical results match.
     // notably, skip the first one as the fit is unstable near 0 mass
     double accum=0;
     for(size_t ui=1;ui<massHist.size();ui++)
         accum+=(massHist[ui]-fittedMassHist[ui]);

     //Check that average error is small
     // NOTE(review): this is the signed mean error and only its upper bound is
     //  checked, so a large negative bias would pass - confirm this is intended.
     ASSERT(accum/(float)massHist.size() < 0.05f);

     //---

     return true;
 }

 #endif

 }
AtomProbe::BACKGROUND_PARAMS::FIT_FAIL_DATA_NON_GAUSSIAN
Definition: processing.h:35

AtomProbe::BACKGROUND_PARAMS::massEnd
float massEnd
Definition: processing.h:41

NUM_IONS
const unsigned int NUM_IONS
Definition: KDTest.cpp:26

misc.h

AtomProbe::getFitErrorMsg
std::string getFitErrorMsg(unsigned int errCode)
Definition: processing.cpp:146

AtomProbe::BACKGROUND_PARAMS::FIT_FAIL_END
Definition: processing.h:36

AtomProbe::makeHistogram
void makeHistogram(const vector< float > &data, float start, float end, float step, vector< float > &histVals)
Definition: processing.cpp:126

AtomProbe::selectElements
void selectElements(const std::vector< T > &in, const std::vector< unsigned int > &indices, std::vector< T > &out)
Obtain the elements at the positions given by indices in the input vector, and copy them to the output.
Definition: misc.h:67

AtomProbe::diff
void diff(const vector< float > &in, vector< float > &out)
Definition: processing.cpp:249

AtomProbe::BACKGROUND_PARAMS
Definition: processing.h:28

AtomProbe::meanAndStdev
void meanAndStdev(const std::vector< T > &f, float &meanVal, float &stdevVal, bool normalCorrection=true)
Definition: misc.h:76

AtomProbe::doFitBackground
unsigned int doFitBackground(const std::vector< float > &massData, BACKGROUND_PARAMS &params)
Perform a background fit, assuming constant TOF noise, from 0->inf time.
Definition: processing.cpp:160

AtomProbe::andersonDarlingStatistic
bool andersonDarlingStatistic(std::vector< T > vals, float &meanV, float &stdevVal, float &statistic, size_t &undefCount, bool computeMeanAndStdev=true)
Definition: processing.cpp:51

AtomProbe::BACKGROUND_PARAMS::FIT_FAIL_INSUFF_DATA
Definition: processing.h:34

AtomProbe::BACKGROUND_PARAMS::intensity
float intensity
Definition: processing.h:46

AtomProbe::BACKGROUND_PARAMS::FIT_FAIL_AVG_COUNTS
Definition: processing.h:33

AtomProbe::findPeaks
void findPeaks(const std::vector< float > &x0, std::vector< unsigned int > &peakInds, float sel, bool autoSel=true, bool includeEndpoints=true)
Simple peak-finding algorithm.
Definition: processing.cpp:268

AtomProbe::BACKGROUND_PARAMS::binWidth
float binWidth
Definition: processing.h:43

AtomProbe
Definition: axialdf.h:24

AtomProbe::BACKGROUND_PARAMS::massStart
float massStart
Definition: processing.h:41

AtomProbe::createMassBackground
void createMassBackground(float massStart, float massEnd, unsigned int nBinsMass, float tofBackIntensity, std::vector< float > &histogram)
Build a histogram of the background counts.
Definition: processing.cpp:224

AtomProbe::BACKGROUND_PARAMS::FIT_FAIL_MIN_REQ_BINS
Definition: processing.h:32

aptAssert.h

ASSERT
#define ASSERT(f)
Definition: zechBackground.cpp:26

misc.h

AtomProbe::BACKGROUND_PARAMS::stdev
float stdev
Definition: processing.h:46

processing.h