libatomprobe
Library for Atom Probe Tomography (APT) computation
filter.cpp
Go to the documentation of this file.
1 /* filter.cpp : Mass spectrum candidate filtering
2  * Copyright (C) 2020 Daniel Haley
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
18 
20 #include "atomprobe/helper/misc.h"
21 
22 #include <limits>
23 #include <cmath>
24 #include <utility>
25 
26 
27 namespace AtomProbe{
28 
29 using std::pair;
30 using std::vector;
31 using std::string;
32 using std::map;
33 
34 
35 
36 unsigned int countIntensityEvents(const vector<pair<float,float> > &data, float minV, float maxV)
37 {
38  unsigned int n=0;
39 #pragma omp parallel for reduction(+:n)
40  for(unsigned int ui=0;ui<data.size();ui++)
41  {
42  if(data[ui].first >=minV && data[ui].first < maxV)
43  n+=data[ui].second;
44  }
45 
46  return n;
47 }
48 
49 
50 void buildFrequencyTable(const vector<ISOTOPE_ENTRY> &solutionVec,
51  vector<size_t> &solutionElements, vector<size_t> &solutionFrequency)
52 {
53  ASSERT(solutionVec.size());
54 
55  //Count number of times we have seen each element
56  map<size_t,size_t> solutionCount;
57  solutionCount.clear();
58  for(const auto & soln : solutionVec)
59  {
60  map<size_t,size_t>::iterator it;
61  it=solutionCount.find(soln.atomicNumber-1);
62 
63  //Check to see if we have encountered this
64  //before, if so accumulate our count
65  if(it==solutionCount.end())
66  solutionCount[soln.atomicNumber-1] =1;
67  else
68  it->second++;
69  }
70 
71  //erase the solution elements, as we have now counted duplicates
72  // and restructure so we have [ element, frequency ] in the two vectors
73  solutionFrequency.clear(); solutionElements.clear();
74  for(map<size_t,size_t>::const_iterator it = solutionCount.begin(); it!=solutionCount.end(); ++it)
75  {
76  solutionElements.push_back(it->first);
77  solutionFrequency.push_back(it->second);
78  }
79 }
80 
81 
82 //FIXME: Algorithm is a little out - we need to sum abundances within a small tolerance window, rather than match exactly to isotope abundance
83 void filterBySolutionPPM(const AbundanceData &massTable, float minPpm, vector<vector<ISOTOPE_ENTRY> > &solutions)
84 {
85  vector<float> solutionProbabilities;
86  solutionProbabilities.resize(solutions.size());
87 
88  vector<bool> killVec;
89  killVec.resize(solutions.size(),false);
90 
91  for(size_t ui=0;ui<solutions.size();ui++)
92  {
93  //Compute the effective mass fraction of this isotope,
94  // assuming the entire material is only made up of this combination
95  float solutionComp;
96  solutionComp=1.0f;
97  for(auto & soln : solutions[ui])
98  {
99  //Get the isotope and element ID for this mass
100  pair<size_t,size_t> tmp;
101  size_t elementIdx,isotopeIdx;
102  elementIdx= soln.massNumber-1;
103  isotopeIdx= soln.atomicNumber-1;
104 
105  solutionComp*=massTable.isotope(elementIdx,
106  isotopeIdx).abundance;
107  }
108 
109  //Decide if the solution is to be kept or not,
110  // based upon the theoretical ppm, and desired
111  killVec[ui] = solutionComp*1e6 < minPpm;
112  }
113 
114  //Trim the solutions
115  vectorMultiErase(solutions,killVec);
116 }
117 
118 
120  const vector<float > &peakPosition, float tolerance,
121  size_t solutionCharge,vector<vector<ISOTOPE_ENTRY> > &solutions)
122 {
123 
124 
125  //We need to scan the solutions, look at each peak and work out
126  // if there are any larger peaks in the list of possible solution
127  vector<bool> cullSolutions;
128  cullSolutions.resize(solutions.size(),false);
129 
130  for(size_t ui=0;ui<solutions.size();ui++)
131  {
132 
133  vector<pair<float,float> > massDist;
134  //Compute the mass distribution from the element mass chain
135  //==
136  {
137 
138  //Convert the list of masses into a [ element, freq] listing
139  vector<size_t> solutionElements;
140  vector<size_t> solutionFreq;
141  buildFrequencyTable(solutions[ui],
142  solutionElements,solutionFreq);
143 
144 
145  //Compute theoretical peak intensity distribution
146  //--
147  massTable.generateIsotopeDist(solutionElements,
148  solutionFreq,massDist);
149 
150  for(unsigned int uj=0;uj<massDist.size();uj++)
151  massDist[uj].first/=solutionCharge;
152  //--
153  }
154  //==
155 
156  //Eliminate any peaks which in have smaller amplitude
157  // from the distribution
158  //--
159  //a ) Find our solution's total mass,
160  float solutionMass;
161  solutionMass=0;
162  for(size_t uj=0;uj<solutions[ui].size();uj++)
163  solutionMass+=solutions[ui][uj].mass;
164 
165  //b) eliminate.First find the if there is an in-range
166  float solutionNaturalAmp=-1;
167  for(size_t uj=0; uj<massDist.size();uj++)
168  {
169  if(fabs(massDist[uj].first - solutionMass) < sqrt(std::numeric_limits<float>::epsilon()))
170  {
171  solutionNaturalAmp=massDist[uj].second;
172  break;
173 
174  }
175  }
176 
177  ASSERT(solutionNaturalAmp>=0.0f);
178 
179  vector<bool> killVec;
180  killVec.resize(massDist.size());
181  for(size_t uj=0;uj<massDist.size();uj++)
182  killVec[uj] = (massDist[uj].second <= solutionNaturalAmp);
183 
184  vectorMultiErase(massDist,killVec);
185  //--
186 
187  //Now check to make sure that *all* peaks above our
188  // solution size exist, marking solutions to be culled
189  // if not all larger peaks can be found in the input
190  for(size_t uj=0;uj<massDist.size();uj++)
191  {
192  bool found;
193  found=false;
194  for(size_t uk=0; uk<peakPosition.size();uk++)
195  {
196  if(fabs(peakPosition[uk]-massDist[uj].first) < tolerance)
197  {
198  found=true;
199  break;
200  }
201  }
202 
203  //If a larger peak could not be found
204  // in the observed distribution,
205  // then this solution is invalid
206  if(!found)
207  {
208  cullSolutions[ui]=true;
209  break;
210  }
211  }
212 
213 
214  }
215 
216  //Cull the solution vector
217  vectorMultiErase(solutions,cullSolutions);
218 
219 }
220 
221 vector<float> maxExplainedFraction(const vector<pair<float,float> > &intensityData, float peakMass, float massWidth,
222  const vector<vector<ISOTOPE_ENTRY> > &solutions, const AbundanceData &massTable,
223  float massDistTol, unsigned int solutionCharge)
224 {
225 
226  //Step 1: Find the solution that we are interested in.
227  vector<pair<float,float> > massDist;
228 
229  vector<float> explainedFractions;
230  explainedFractions.resize(solutions.size());
231 
232  for(size_t ui=0;ui<solutions.size();ui++)
233  {
234 
235  //Convert the list of masses into a [ element, freq] listing
236  vector<size_t> solutionElements;
237  vector<size_t> solutionFreq;
238  buildFrequencyTable(solutions[ui],
239  solutionElements,solutionFreq);
240 
241 
242  //Compute theoretical peak intensity distribution.
243  // use the grouped version, to clump similar massed
244  // isotopes together.
245  //--
246  massTable.generateGroupedIsotopeDist(solutionElements,
247  solutionFreq,massDist,massDistTol);
248 
249  //Correct for solution charge.
250  for(unsigned int uj=0;uj<massDist.size();uj++)
251  massDist[uj].first/=solutionCharge;
252  //--
253 
254  //For each range, we need to compute the number of counts in that range
255 
256  unsigned int closestPeak;
257  float massErr;
258  massErr = std::numeric_limits<float>::max();
259  closestPeak=(unsigned int)-1;
260 
261  vector<float> observedCounts;
262  observedCounts.resize(massDist.size());
263  for(unsigned int uj=0;uj<massDist.size();uj++)
264  {
265  observedCounts[uj] =countIntensityEvents(intensityData,massDist[uj].first-massWidth, massDist[uj].first+massWidth);
266 
267  //Find the best-matching peak from the theoretical distribution
268  // to our target peak
269  float massErrTmp;
270  massErrTmp = fabs(massDist[uj].first - peakMass);
271  if(massErrTmp < massErr)
272  {
273  massErr = massErrTmp;
274  closestPeak=uj;
275  }
276 
277  //FIXME: Add some extra counts in order to acommodate our confidence level
278  //observedCounts[uj] += confidencePoisson(observedCounts[uj],alpha);
279  }
280 
281  //Should have found a peak.
282  ASSERT(closestPeak !=(unsigned int)-1);
283 
284 
285  //rescale our theoretical distribution to the real one
286  float scaleFactor;
287  scaleFactor=observedCounts[closestPeak]/massDist[closestPeak].second;
288 
289 
290  vector<float> scaledSolution;
291  scaledSolution.resize(massDist.size());
292  for(unsigned int uj=0;uj<massDist.size();uj++)
293  scaledSolution[uj] = massDist[uj].second*scaleFactor;
294 
295  //Find the greatest difference between the theoretical and the observed
296  // This comes from the peak which has the biggest ratio between observed and
297  // our scaled-down isotopic distribution
298  //=================
299  float minTheoreticalRatio=std::numeric_limits<float>::max();
300  const unsigned int OBSERVED_COUNT_THRESHOLD = 10;
301  const unsigned int MASS_DIST_MIN_THRESHOLD= 10;
302  for(unsigned int uj=0;uj<scaledSolution.size();uj++)
303  {
304  //If the ratio is likely to be unstable, then don't compute this
305  if(observedCounts[uj] < OBSERVED_COUNT_THRESHOLD &&
306  scaledSolution[uj] < MASS_DIST_MIN_THRESHOLD)
307  continue;
308 
309  float theoreticalRatio;
310  //Theoretical ratio is the ratio of observed to theoretical
311  // counts
312  theoreticalRatio=observedCounts[uj]/scaledSolution[uj];
313  minTheoreticalRatio=std::min(minTheoreticalRatio,theoreticalRatio);
314  }
315 
316  //Scale down the solutions by the limiting ratio
317  for(unsigned int uj=0;uj<scaledSolution.size();uj++)
318  scaledSolution[uj]*=minTheoreticalRatio;
319 
320  if(minTheoreticalRatio!=std::numeric_limits<float>::max())
321  {
322  //Find the index of the peak we have selected
323  // for identification
324  //FIXME: This should not simply snap the closest.
325  float minDelta=std::numeric_limits<float>::max();
326  unsigned int minIdx=(unsigned int)-1;
327  for(unsigned int uj=0;uj<massDist.size();uj++)
328  {
329  float delta;
330  delta=fabs(massDist[uj].first-peakMass);
331  if(delta < minDelta)
332  {
333  minDelta=delta;
334  minIdx=uj;
335  }
336  }
337 
338  if(minIdx != (unsigned int)-1)
339  {
340  explainedFractions[ui]=scaledSolution[minIdx]/observedCounts[minIdx];
341  ASSERT(explainedFractions[ui] >=0.0f && explainedFractions[ui] <=1.1f);
342  }
343  else
344  explainedFractions[ui]=-1.0f;
345 
346  }
347  else
348  explainedFractions[ui]=-1.0f;
349  //=================
350  }
351  return explainedFractions;
352 
353 }
354 
355 
356 #ifdef DEBUG
357 #include <iostream>
358 #include <cstdlib>
359 
360 using std::cerr;
361 using std::endl;
362 
363 bool testMaxExplainedFraction()
364 {
365  AbundanceData massTable;
366  if(massTable.open("naturalAbundance.xml"))
367  {
368  cerr << "WARN : Error opening abundance table, skipping" << endl;
369  return true;
370  }
371 
372  //Simple test. A single ion should be 100% explained by the
373  // matching isotope
375  {
376  vector<pair<float,float> > massData;
377 
378  ISOTOPE_ENTRY mnWeight;
379 
380  unsigned int elemIdx,isotopeIdx;
381  elemIdx= massTable.symbolIndex("Mn");
382  isotopeIdx=massTable.getMajorIsotopeFromElemIdx(elemIdx);
383  mnWeight= massTable.isotope(elemIdx,isotopeIdx);
384 
385 
386  //Create some simulated mass data
387  const unsigned int NUM_PTS =100;
388  massData.resize(1);
389  massData[0].first=mnWeight.mass;
390  massData[0].second=NUM_PTS;
391 
392  //OK, so we've set the weight. Lets create a "solution" to the mass problem.
393  // this is simply a vector of weights comprising the species that make up
394  // the combined result
395  vector<ISOTOPE_ENTRY> soln;
396  soln.push_back(mnWeight);
397 
398  //Package this up for the function
399  vector<vector<ISOTOPE_ENTRY> > solutions;
400  solutions.push_back(soln);
401 
402  vector<float> explainedFractions;
403  explainedFractions = maxExplainedFraction(massData, mnWeight.mass , 0.15, solutions,massTable,0.01,1);
404 
405  ASSERT(explainedFractions.size() ==1);
406 
407  float delta = fabs(explainedFractions[0] -1.0f);
408 
409  //FIXME: This is a little lax?
410  TEST(delta <0.05,"Mn isotope test");
411  }
412 
413  {
414  //More complex test case. Lets have a Ga overlap with some
415  // unknown species, such that the observed ratios are 1:1
416  // In this case, the first peak should be fully explained by Ga,
417  // but the second peak should not be
418 
419  vector<pair<float,float> > massData;
420 
421  ISOTOPE_ENTRY gaWeight[2];
422 
423  unsigned int gaIdx;
424 
425  gaIdx= massTable.symbolIndex("Ga");
426 
427  gaWeight[0]=massTable.isotope(gaIdx,0);
428  gaWeight[1]=massTable.isotope(gaIdx,1);
429 
430  //Create some simulated mass data, in a 1:1 ratio
431  const unsigned int NUM_PTS =6010;
432  massData.resize(2);
433  massData[0].first = gaWeight[0].mass;
434  massData[1].first = gaWeight[1].mass;
435  massData[0].second = NUM_PTS;
436  massData[1].second = NUM_PTS;
437 
438 
439  vector<ISOTOPE_ENTRY> soln;
440  soln.push_back(gaWeight[0]);
441 
442  //Package this up for the function
443  vector<vector<ISOTOPE_ENTRY> > solutions;
444  solutions.push_back(soln);
445 
446  {
447  vector<float> explainedFractions;
448  explainedFractions = maxExplainedFraction(massData, gaWeight[0].mass ,
449  0.15, solutions,massTable,0.01, 1);
450 
451  ASSERT(explainedFractions.size() ==1);
452 
453  float delta = fabs(explainedFractions[0] -1.0f);
454 
455  //FIXME: This is a little lax?
456  TEST(delta <0.05,"Ga large isotope test");
457  }
458 
459  //Repeat for other peak
460  {
461  vector<float> explainedFractions;
462  explainedFractions = maxExplainedFraction(massData, gaWeight[1].mass ,
463  0.15, solutions,massTable,0.01,1);
464 
465  ASSERT(explainedFractions.size() ==1);
466 
467  //Should be ~70% explained (due to isotopic abundance - Ga only can make up 70.925% of the observed peak count)
468  float delta = fabs(explainedFractions[0] -0.70925f);
469 
470  //FIXME: This is a little lax?
471  TEST(delta <0.05,"Ga small isotope test");
472  }
473  }
474 
475  {
476  //More complex test case again.
477  // Overlap Cl with AlB at the 1+ charge state in ratio 1:1.
478  // Check the 37 overlap
479 
480  vector<pair<float,float> > massData;
481 
482  ISOTOPE_ENTRY clWeight[2], alWeight, bWeight;
483 
484  unsigned int elemIdx;
485  elemIdx=massTable.symbolIndex("Cl");
486  clWeight[0]=massTable.isotope(elemIdx,0);
487  clWeight[1]=massTable.isotope(elemIdx,1);
488 
489  //Obtain the weights for 27^Al and 10^B
490  // combined these have mass ~37 amu
491  elemIdx=massTable.symbolIndex("Al");
492  alWeight=massTable.isotope(elemIdx,0);
493 
494  elemIdx=massTable.symbolIndex("B");
495  bWeight=massTable.isotope(elemIdx,0);
496 
497  //Create some simulated mass data, in a 1:1 ratio
498  // Natural abundances for Cl : 35^Cl -> 75.77%, 37^Cl -> 24.23%
499  const unsigned int NUM_35_CLPTS =7577;
500  const unsigned int NUM_37_CLPTS =2423;
501  massData.resize(2);
502  massData[0].first = clWeight[0].mass;
503  massData[1].first = clWeight[1].mass;
504  massData[0].second = NUM_35_CLPTS;
505  massData[1].second = NUM_37_CLPTS;
506 
507  //So we are going to ask the question,
508  // what is the explain fraction of the peak at 37 being Cl, rather than AlB
509  // Let us thus propose solutions for the 37 peak
510 
511  vector<ISOTOPE_ENTRY> solnCl, solnAlB;
512  solnCl.push_back(clWeight[1]);
513 
514  solnAlB.push_back(alWeight);
515  solnAlB.push_back(bWeight);
516 
517  //Package this up for the function
518  vector<vector<ISOTOPE_ENTRY> > solutions;
519  solutions.push_back(solnCl);
520  solutions.push_back(solnAlB);
521 
522  {
523  //Compute the explained fraction of the two solutions
524  vector<float> explainedFractions;
525  float meanMass = 0.5*(clWeight[1].mass + (alWeight.mass + bWeight.mass));
526  explainedFractions = maxExplainedFraction(massData, meanMass,
527  0.15, solutions,massTable,0.01,1);
528 
529  ASSERT(explainedFractions.size() ==2);
530 
531 
532  //FIXME: This is a little lax?
533  TEST(explainedFractions[0] > 0.95,"Cl fully explains 37 peak");
534  TEST(explainedFractions[1] < 0.05,"AlB does not explain 37 peak");
535  }
536  }
537  return true;
538 }
539 
540 bool isotopeFilterTests()
541 {
542  return testMaxExplainedFraction();
543 }
544 
545 #endif
546 
547 }
size_t symbolIndex(const char *symbol, bool caseSensitive=true) const
Return the element&#39;s position in table, starting from 0.
Definition: abundance.cpp:214
void buildFrequencyTable(const vector< ISOTOPE_ENTRY > &solutionVec, vector< size_t > &solutionElements, vector< size_t > &solutionFrequency)
Definition: filter.cpp:50
Definition: abundance.h:42
void generateIsotopeDist(const std::vector< size_t > &elementIdx, const std::vector< size_t > &frequency, std::vector< std::pair< float, float > > &massDist) const
Compute the mass-probability distribution for the grouped set of ions.
Definition: abundance.cpp:277
void vectorMultiErase(std::vector< T > &vec, const std::vector< bool > &wantKill)
Remove elements from the vector, without preserving order.
Definition: misc.h:112
void filterPeakNeedBiggerObs(const AtomProbe::AbundanceData &massTable, const std::vector< float > &peakData, float tolerance, size_t solutionCharge, std::vector< std::vector< AtomProbe::ISOTOPE_ENTRY > > &solutions)
Definition: filter.cpp:119
#define TEST(f, g)
Definition: aptAssert.h:49
void generateGroupedIsotopeDist(const std::vector< size_t > &elementIdx, const std::vector< size_t > &frequency, std::vector< std::pair< float, float > > &massDist, float massTolerance) const
As per generateIsotopeDist, however, this convenience groups the distribution to limit the effect of ...
Definition: abundance.cpp:395
void filterBySolutionPPM(const AtomProbe::AbundanceData &massTable, float minPpm, std::vector< std::vector< AtomProbe::ISOTOPE_ENTRY > > &solutions)
Use the maximum possible PPM for each isotopic combination to filter possible solutions.
Definition: filter.cpp:83
size_t open(const char *file, bool strict=false)
Attempt to open the abundance data file, return 0 on success, nonzero on failure. ...
Definition: abundance.cpp:79
float abundance
Definition: abundance.h:49
Class to load abundance information for natural isotopes.
Definition: abundance.h:54
std::vector< float > maxExplainedFraction(const std::vector< std::pair< float, float > > &massData, float peakMass, float massWidth, const std::vector< std::vector< AtomProbe::ISOTOPE_ENTRY > > &solutions, const AtomProbe::AbundanceData &massTable, float massDistTol, unsigned int solutionCharge)
Compute the fraction of the data that has been explained, using the natural abundance information...
size_t getMajorIsotopeFromElemIdx(size_t elementIdx) const
Obtain the most prominent isotope&#39;s index from the element index.
Definition: abundance.cpp:482
#define ASSERT(f)
float mass
Definition: abundance.h:47
const ISOTOPE_ENTRY & isotope(size_t elementIdx, size_t isotopeIdx) const
Obtain a reference to a particular isotope, using the element&#39;s index in the table, and the isotope index.
Definition: abundance.cpp:271
unsigned int countIntensityEvents(const vector< pair< float, float > > &data, float minV, float maxV)
Definition: filter.cpp:36