libatomprobe
Library for Atom Probe Tomography (APT) computation
processing.cpp
Go to the documentation of this file.
1 /* processing.cpp : Processing mass spectra - statistical and numerical fucntions
2  * Copyright (C) 2017 Daniel Haley
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 #include <vector>
18 #include <algorithm>
19 #include <cmath>
20 #include <limits>
21 #include <string>
22 #include <random>
23 
24 #include <gsl/gsl_sf_erf.h>
25 #include <gsl/gsl_histogram.h>
26 
28 #include "atomprobe/helper/misc.h"
30 
32 
33 using std::string;
34 using std::vector;
35 
36 namespace AtomProbe
37 {
38 
39 
40 //Anderson. test statistic for gaussian-ness. Returns false if input has insufficient points for test (2 items)
41 //Implented for unknown (derived from data) mean & variance
42 // reject statistic if output has this prob. of non-normality:
43 // 15% - 0.576
44 // 10% - 0.656
45 // 5% - 0.787
46 //2.5% - 0.918
47 // 1% - 1.092
48 //See, eg
49 // http://itl.nist.gov/div898/handbook/eda/section3/eda35e.htm
50 template<class T>
51 bool andersonDarlingStatistic(std::vector<T> vals, float &meanV, float &stdevVal,
52  float &statistic, size_t &undefCount, bool computeMeanAndStdev=true)
53 {
54  size_t n=vals.size();
55  //we cannot compute this without more data
56  if(n <= 1)
57  return false;
58 
59  if(computeMeanAndStdev)
60  meanAndStdev(vals,meanV,stdevVal);
61 
62  //Bring assumed gauss data into a normal dist
63  for(size_t ui=0;ui<n;ui++)
64  vals[ui]=(vals[ui]-meanV)/stdevVal;
65 
66  //For test, data *must be sorted*
67  std::sort(vals.begin(),vals.end());
68 
69  //Compute the Phi distribution from the error function
70  // - also compute log of this for later use
71  //--
72  std::vector<double> normedPhi,lonCdf;
73  std::vector<bool> normedPhiOK;
74 
75  normedPhiOK.resize(n,true);
76  normedPhi.resize(n);
77  for(size_t ui=0;ui<n; ui++)
78  {
79  normedPhi[ui] = 0.5*(1.0+gsl_sf_erf(vals[ui]/sqrt(2.0)));
80 
81  if(normedPhi[ui] < std::numeric_limits<float>::epsilon())
82  normedPhiOK[ui]=false;
83  }
84 
85  lonCdf.resize(n);
86  for(size_t ui=0;ui<n; ui++)
87  {
88  if(normedPhiOK[ui])
89  lonCdf[ui] = log(normedPhi[ui]);
90  else
91  normedPhi[ui]=2.0f; //result will imply v 1.0-normedphi[...] < 0 --> Undefined
92  }
93  //--
94 
95  //Compute anderson-darling statistic
96  //--
97  undefCount=0;
98  double sumV=0.0;
99  for(size_t i=0;i<n; i++)
100  {
101  double v;
102  v=1.0-normedPhi[n-(i+1)];
103  if( v > 0.0)
104  sumV+=(2.0*(i+1.0)-1.0)*(lonCdf[i] + log(v));
105  else
106  undefCount++;
107  }
108 
109  n=n-undefCount;
110  //Compute A^2
111  statistic=-(double)n - sumV/(double)n;
112 
113  //Perform correction of Shorack & Wellner (mean, variance unknown)
114  //pp239, "Empirical Processes with Applications to Statistics"
115  //doi.org/10.1137/1.9780898719017
116  //Table 1, part A
117  statistic*=(1.0 + 4.0/(double)n - 25/(double(n)*double(n)));
118 
119  //--
120 
121 
122  return true;
123 }
124 
125 
126 void makeHistogram(const vector<float> &data, float start,
127  float end, float step, vector<float> &histVals)
128 {
129  ASSERT(start < end);
130  ASSERT(step > std::numeric_limits<float>::epsilon());
131 
132  gsl_histogram *h = gsl_histogram_alloc((end-start)/step);
133  gsl_histogram_set_ranges_uniform(h,start,end);
134 
135  for(size_t ui=0; ui<data.size();ui++)
136  gsl_histogram_increment(h,data[ui]);
137 
138  //Copy out data
139  histVals.resize(h->n);
140  for(size_t ui=0;ui<h->n; ui++)
141  histVals[ui]=h->bin[ui];
142 
143  gsl_histogram_free(h);
144 }
145 
146 string getFitErrorMsg(unsigned int errMsg)
147 {
149  const char * errorMsgs[BACKGROUND_PARAMS::FIT_FAIL_END] = {
150  "",
151  "Insufficient bins to perform fit (in TOF-space)",
152  "Insufficient counts to perform fit (in TOF-space)",
153  "Insufficient data to perform fit",
154  "Data did not appear to be random noise - cannot fit noise level"
155  };
156 
157  return std::string(errorMsgs[errMsg]);
158 }
159 
160 unsigned int doFitBackground(const vector<float> &masses,
161  BACKGROUND_PARAMS &backParams)
162 {
163  ASSERT(backParams.massStart < backParams.massEnd);
164 
165  vector<float> sqrtFiltMass;
166  for(size_t ui=0;ui<masses.size();ui++)
167  {
168  if( masses[ui] >=backParams.massStart && masses[ui] <= backParams.massEnd)
169  sqrtFiltMass.push_back(sqrtf(masses[ui]));
170  }
171 
172  //Minimum required counts per bin to have sufficient statistics
173  const unsigned int MIN_REQUIRED_AVG_COUNTS=10;
174  const unsigned int MIN_REQUIRED_BINS=10;
175 
176  size_t nBinsTof = (sqrt(backParams.massEnd) - sqrt(backParams.massStart)) / backParams.binWidth;
177  float filterStep = (sqrt(backParams.massEnd) - sqrt(backParams.massStart) )/ nBinsTof;
178 
179  //we cannot perform a test with fewer than this number of bins
180  if ( nBinsTof < MIN_REQUIRED_BINS)
182 
183  //The reasoning for this test is not well-grounded.
184  // It arises as we are approximating a poisson as a gaussian.
185  // If this number is too small, our guassian becomes poisson shaped
186  // and the anderson test is not valid to apply.
187  // however we have not referenced/checked how small "too-small" really is.
188  // Some testing was done, but needs to be revisited with more rigour
189  float averageCounts = sqrtFiltMass.size()/ (float)nBinsTof;
190  if( averageCounts < MIN_REQUIRED_AVG_COUNTS)
192 
193  //Check that the TOF-space histogram is gaussian
194  vector<float> histogram;
195  makeHistogram(sqrtFiltMass,sqrt(backParams.massStart),
196  sqrt(backParams.massEnd), filterStep,histogram);
197 
198  float andersonStat,meanVal;
199  size_t undefCount;
200  //TODO: Error message regarding fit failure
201  if(!andersonDarlingStatistic(histogram,meanVal,backParams.stdev,andersonStat, undefCount))
203 
204  //Rejection threshold for Anderson statistic
205  // - either we didn't have enough samples,
206  // - or we failed the null hypothesis test of Gaussian-ness
207  // Rejection of null hypothesis at 99% confidence occurs at 1.092 [NIST].
208  // we use much more than this, in case batch processing/familywise
209  // error is present (i.e. we are calling this function a lot)
210  // two slightly overlapping Gaussians can trigger at the 1.8 level
211  const float STATISTIC_THRESHOLD=2.0;
212  if(andersonStat > STATISTIC_THRESHOLD || undefCount == histogram.size())
214 
215  //Intensity PER BIN in TOF space (counts/time)
216  backParams.intensity= meanVal;
217 
218  return 0;
219 }
220 
221 //Start and end mass, and step size (to get bin count).
222 // tofBackIntensity is the intensity level per unit time in the background, as obtained by doFitBackground
223 // the histogram is
224 void createMassBackground(float massStart, float massEnd, unsigned int nBinsMass,
225  float tofBackIntensity, vector<float> &histogram)
226 {
227  const float MC_BIN_STEP = (massEnd-massStart)/nBinsMass;
228 
229  //compute fitted value analytically
230  histogram.resize(nBinsMass);
231  for(size_t ui=0;ui<histogram.size();ui++)
232  {
233  float mcX;
234  mcX=(float)(ui)*MC_BIN_STEP+ massStart;
235  if ( mcX <=0)
236  histogram[ui]=0;
237  else
238  {
239  float mLow=mcX;
240  float mHigh=mcX+MC_BIN_STEP;
241 
242  //This is the discrete approximation to the area under a 1/sqt
243  histogram[ui] = tofBackIntensity*(sqrt(mHigh) - sqrt(mLow));
244 
245  }
246  }
247 }
248 
249 void diff(const vector<float> & in, vector<float>& out)
250 {
251 #define USE_CENTRAL
252 #ifdef USE_CENTRAL
253  //2nd order central
254  out.resize(in.size()-1);
255  for(auto i=1u; i<in.size()-1; ++i)
256  out[i-1] = 0.5*(in[i+1] - in[i-1]);
257 
258  //First-order
259  out[out.size()-1] = in[in.size()-1] - in[in.size()-2];
260 #else
261  //This computes a first upwind differential (x'_i = x_(i+1)-x_i)
262  out.resize(in.size()-1);
263  for(auto i=1u; i<in.size(); ++i)
264  out[i-1] = in[i] - in[i-1];
265 #endif
266 }
267 
268 void findPeaks(const vector<float> &x0, vector<unsigned int>& peakInds, float sel,
269  bool autoSel,bool includeEndpoints)
270 {
271  if(x0.size() < 2)
272  return;
273 
274  //Heuristically select the cutoff threshold
275  if(autoSel)
276  {
277  auto p = std::minmax_element(x0.begin(),x0.end());
278 
279  size_t minIdx=std::distance(x0.begin(),p.first);
280  size_t maxIdx=std::distance(x0.begin(),p.second);
281 
282  sel = (x0[maxIdx]-x0[minIdx])/4.0;
283  }
284 
285  //Compute derivative
286  vector<float> dx0;
287  diff(x0,dx0);
288 
289  //Adjust values so repeated values are not 0 derivative
290  for(auto &f : dx0)
291  {
292  if(f ==0)
293  f = std::numeric_limits<float>::epsilon();
294  }
295 
296  //indices of potential peaks
297  vector<unsigned int> ind;
298 
299  //Push padding value if needed
300  if(includeEndpoints)
301  ind.push_back(0);
302 
303  //Find the position to the right,
304  // where the derivative changes sign
305  for(auto ui=0u;ui<dx0.size()-1;ui++)
306  {
307  if(std::signbit(dx0[ui]) !=std::signbit(dx0[ui+1]))
308  ind.push_back(ui+1);
309  }
310 
311  float leftMin, minMag;
312 
313  vector<float> x;
314  if(includeEndpoints)
315  {
316  //"bookend" values by exnteding constant value
317  x.resize(ind.size()+2);
318  x[0]=x0[0];
319  for(auto i=0u; i<ind.size()-1; i++)
320  {
321  //Note +1 due to previous push_back of padding value
322  x[i+1] = x0[ind[i+1]];
323  }
324  x[x.size()-1] =x0[x0.size()-1];
325 
326  leftMin=minMag = *std::min_element(x.begin(),x.end());
327 
328  ind.push_back(x0.size()-1);
329 
330  }
331  else
332  {
333 
334  selectElements(x0,ind,x);
335  minMag=*std::min_element(x.begin(),x.end());
336  leftMin=std::min(x[0],x0[0]);
337  }
338 
339 
340  if(x.size() <=2)
341  return;
342 
343  //Set up initial loop parameters
344  float tempMag= minMag;
345 
346  if(includeEndpoints)
347  {
348  vector<float> xSub = {x[0],x[1],x[2]};
349  vector<float> signDx;
350  diff(xSub,signDx);
351  for(auto &f : signDx)
352  f = std::copysign(1.0f,f);
353 
354 
355  if(signDx[0] <=0)
356  {
357  if(signDx[0] == signDx[1])
358  {
359  x.erase(x.begin()+1);
360  ind.erase(ind.begin()+1);
361  }
362  }
363  else
364  {
365  if(signDx[0] == signDx[1])
366  {
367  x.erase(x.begin());
368  ind.erase(ind.begin());
369  }
370  }
371 
372 
373  }
374 
375  unsigned int ii = (x[0] >=x[1]) ? 0 : 1;
376  vector<unsigned int> peakLoc;
377  bool foundPeak=false;
378 
379  unsigned int tempLoc;
380  while( ii < x.size()-1)
381  {
382  ii++;
383 
384  //Reset peak finding if we had a peak and the next is bigger
385  // Than the last, or the left min was small enough to reset
386  if(foundPeak)
387  {
388  tempMag=minMag;
389  foundPeak=false;
390  }
391 
392 
393  // Check for new peak larger than the tmeporary size
394  // and selectivity larger than the minimum to its left
395  if( x[ii-1] > tempMag && x[ii-1] > leftMin + sel)
396  {
397  tempLoc=ii-1;
398  tempMag=x[ii-1];
399  }
400 
401  if(ii == x.size()-1)
402  break;
403 
404  ii++;
405 
406  //Come down at least sel from peak
407  if( !foundPeak && tempMag > sel + x[ii-1])
408  {
409  foundPeak=true; //we have apeak
410  leftMin = x[ii-1];
411  peakLoc.push_back(tempLoc);
412  }
413  else
414  leftMin = x[ii-1];
415 
416  }
417 
418  if(includeEndpoints)
419  {
420  if(x[x.size()-1] > tempMag && x[x.size()-1] > leftMin + sel)
421  peakLoc.push_back(x.size()-1);
422  else if (!foundPeak && tempMag > minMag)
423  peakLoc.push_back(tempLoc);
424  }
425  else if (!foundPeak)
426  {
427  if(x[x.size()-1] >tempMag && x[x.size()-1] > leftMin + sel)
428  peakLoc.push_back(x.size()-1);
429  else if( tempMag > std::min(x0[x0.size()-1],x[x.size()-1]) + sel)
430  peakLoc.push_back(tempLoc);
431  }
432 
433  peakInds.resize(peakLoc.size());
434  for(auto i=0u; i< peakLoc.size();i++)
435  peakInds[i]=ind[peakLoc[i]];
436 
437 }
438 
439 
440 #ifdef DEBUG
441 
442 bool testBackgroundFitMaths()
443 {
444  // Seed with a real random value, if available
445  std::random_device r;
446  std::mt19937 gen(r());
447  std::uniform_real_distribution<> dist(0.0, 1.0);
448 
449  const unsigned int NUM_IONS =100000;
450 
451  //Simulate a histogram of NUM_IONS
452  // between a lower and upper limit.
453  // This is flat in TOF space, with mean intensity
454  // given by NUM_IONS/NUM_BINS
455  //---
456  const float TOF_LIMIT[2] = { 0.0,100};
457 
458  vector<float> rawData;
459  rawData.resize(NUM_IONS);
460  for(size_t ui=0;ui<NUM_IONS; ui++)
461  {
462  float simTof;
463  simTof = dist(gen)*(TOF_LIMIT[1]-TOF_LIMIT[0] ) + TOF_LIMIT[0];
464  rawData[ui] = simTof;
465  }
466 
467 
468 
469 
470  //Now perform the fit in m/c space, and after, check that it matches the anticipated m/c histogram.
471  //---
472 
473  //compute the mass histogram numerically
474  vector<float> massData;
475  massData.resize(NUM_IONS);
476  for(size_t ui=0;ui<NUM_IONS;ui++)
477  massData[ui] = rawData[ui]*rawData[ui];
478  vector<float> massHist;
479 
480  //Recompute the bin step parameter, as the stepping in m/c space to yield
481  // the same number of bins will e radially different
482  const float NBINS_TOF = 2000;
483  const float NBINS_MASS= NBINS_TOF;
484  const float MASS_LIMIT[2] = {TOF_LIMIT[0]*TOF_LIMIT[0], TOF_LIMIT[1]*TOF_LIMIT[1]};
485 
486 
487  //time-space intensity per unit time
488  const float TOF_MEAN_INT= NUM_IONS/(TOF_LIMIT[1] - TOF_LIMIT[0]);
489 
490  const float MC_BIN_STEP = (MASS_LIMIT[1]-MASS_LIMIT[0])/NBINS_MASS;
491  makeHistogram(massData,MASS_LIMIT[0],MASS_LIMIT[1],MC_BIN_STEP,massHist);
492 
493  //compute fitted value analytically
494  vector<float > fittedMassHist;
495  createMassBackground(MASS_LIMIT[0],MASS_LIMIT[1],NBINS_MASS,
496  TOF_MEAN_INT,fittedMassHist);
497 
498  double accum=0;
499 
500  //check that the numerical and analytical results match.
501  // notably, skip the first one as the fit is unstable near 0 mass
502  for(size_t ui=1;ui<massHist.size();ui++)
503  {
504  float midV;
505  midV = massHist[ui] + fittedMassHist[ui];
506  midV*=0.5f;
507 
508  accum+=(massHist[ui]-fittedMassHist[ui]);
509  }
510 
511  //Check that average error is small
512  ASSERT(accum/(float)massHist.size() < 0.05f);
513 
514  //---
515 
516  return true;
517  }
518 
519 #endif
520 
521 }
const unsigned int NUM_IONS
Definition: KDTest.cpp:26
std::string getFitErrorMsg(unsigned int errCode)
Definition: processing.cpp:146
void makeHistogram(const vector< float > &data, float start, float end, float step, vector< float > &histVals)
Definition: processing.cpp:126
void selectElements(const std::vector< T > &in, const std::vector< unsigned int > &indices, std::vector< T > &out)
Obtain the elements at positions indicies in the input vector, copy to output.
Definition: misc.h:67
void diff(const vector< float > &in, vector< float > &out)
Definition: processing.cpp:249
void meanAndStdev(const std::vector< T > &f, float &meanVal, float &stdevVal, bool normalCorrection=true)
Definition: misc.h:76
unsigned int doFitBackground(const std::vector< float > &massData, BACKGROUND_PARAMS &params)
Perform a background fit, assuming constant TOF noise, from 0->inf time.
Definition: processing.cpp:160
bool andersonDarlingStatistic(std::vector< T > vals, float &meanV, float &stdevVal, float &statistic, size_t &undefCount, bool computeMeanAndStdev=true)
Definition: processing.cpp:51
void findPeaks(const std::vector< float > &x0, std::vector< unsigned int > &peakInds, float sel, bool autoSel=true, bool includeEndpoints=true)
Simple peak-finding algorithm.
Definition: processing.cpp:268
void createMassBackground(float massStart, float massEnd, unsigned int nBinsMass, float tofBackIntensity, std::vector< float > &histogram)
Build a histogram of the background counts.
Definition: processing.cpp:224
#define ASSERT(f)