manuals/libatomprobe/filter_8cpp_source.html

 /* filter.cpp :  Mass spectrum candidate filtering
  * Copyright (C) 2020  Daniel Haley
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 #include "atomprobe/algorithm/filter.h"

 #include "atomprobe/helper/aptAssert.h"
 #include "atomprobe/helper/misc.h"

 #include <limits>
 #include <cmath>
 #include <utility>


 namespace AtomProbe{

 using std::pair;
 using std::vector;
 using std::string;
 using std::map;


 //!Sum the intensity (second member) of every entry whose position (first member)
 // lies inside the half-open window [minV, maxV).
 /*! \param data list of (position, intensity) pairs
  *  \param minV inclusive lower bound on the position
  *  \param maxV exclusive upper bound on the position
  *  \return the total intensity inside the window, as a whole number of events.
  *   Each intensity is truncated towards zero before being added.
  */
 unsigned int countIntensityEvents(const std::vector<std::pair<float,float> > &data, float minV, float maxV)
 {
     unsigned int n=0;
 #pragma omp parallel for reduction(+:n)
     for(unsigned int ui=0;ui<data.size();ui++)
     {
         if(data[ui].first >=minV && data[ui].first < maxV)
         {
             //Convert the intensity to an integer *before* accumulating.
             // Writing "n+=<float>" promotes n to float for the addition,
             // which silently discards small increments once the running
             // total exceeds 2^24. Going via a signed 64-bit value keeps
             // the conversion well-defined for negative intensities
             const long long events=static_cast<long long>(data[ui].second);
             n+=static_cast<unsigned int>(events);
         }
     }

     return n;
 }


 //!Collapse a list of isotopes into parallel [element, frequency] vectors
 /*! \param solutionVec isotopes making up one candidate solution; must be non-empty
  *  \param solutionElements output, overwritten : each distinct value of (atomicNumber-1)
  *   seen in solutionVec, in ascending order
  *  \param solutionFrequency output, overwritten : how many times the matching entry
  *   of solutionElements occurred in solutionVec
  */
 void buildFrequencyTable(const vector<ISOTOPE_ENTRY> &solutionVec,
         vector<size_t> &solutionElements, vector<size_t> &solutionFrequency)
 {
     ASSERT(solutionVec.size());

     //Tally the occurrences of each element. operator[] value-initialises
     // a missing entry to zero, so first sightings end up with a count of one
     map<size_t,size_t> tally;
     for(const auto &entry : solutionVec)
     {
         const size_t elementKey=entry.atomicNumber-1;
         ++tally[elementKey];
     }

     //Unpack the tally into the two output vectors. The map iterates
     // in key order, so the outputs are sorted by element
     solutionElements.clear();
     solutionFrequency.clear();
     for(const auto &elementAndCount : tally)
     {
         solutionElements.push_back(elementAndCount.first);
         solutionFrequency.push_back(elementAndCount.second);
     }
 }


 //FIXME: Algorithm is a little out - we need to sum abundances within a small tolerance window, rather than match exactly to isotope abundance
 //!Remove solutions whose combined isotopic abundance is below a ppm threshold
 /*! For each solution, the abundances of its constituent isotopes are multiplied
  *  together. The solution is discarded if that product, expressed in parts
  *  per million, is less than minPpm.
  *  \param massTable abundance table; retained for interface compatibility, the
  *   abundance is read from each ISOTOPE_ENTRY directly
  *  \param minPpm minimum product of abundances (in ppm) required to keep a solution
  *  \param solutions candidate solutions, filtered in place. Ordering of the
  *   survivors is whatever vectorMultiErase leaves behind
  */
 void filterBySolutionPPM(const AbundanceData &massTable, float minPpm, vector<vector<ISOTOPE_ENTRY> > &solutions)
 {
     //Each isotope entry already carries its own abundance. The previous table
     // lookup passed (massNumber-1) as the element index and (atomicNumber-1) as
     // the isotope index, which does not address the isotope being examined
     (void)massTable;

     vector<bool> killVec(solutions.size(),false);

     for(size_t ui=0;ui<solutions.size();ui++)
     {
         //Compute the effective fraction of this isotope combination,
         // assuming the entire material is only made up of this combination
         float solutionComp=1.0f;
         for(const auto &soln : solutions[ui])
             solutionComp*=soln.abundance;

         //Decide if the solution is to be kept or not,
         // based upon the theoretical ppm, and the desired minimum
         killVec[ui] = solutionComp*1e6 < minPpm;
     }

     //Trim the solutions
     vectorMultiErase(solutions,killVec);
 }


 //!Remove solutions whose larger theoretical isotope peaks were not observed
 /*! For each solution the theoretical isotope distribution is generated and
  *  converted to mass-to-charge. Every theoretical peak with a larger amplitude
  *  than the solution's own peak must lie within "tolerance" of some entry in
  *  peakPosition, otherwise the solution is discarded.
  *  \param massTable abundance table used to generate the isotope distribution
  *  \param peakPosition observed peak positions
  *  \param tolerance maximum distance between a theoretical and an observed
  *   peak for them to be considered the same peak
  *  \param solutionCharge charge state; theoretical masses are divided by this
  *  \param solutions candidate solutions, filtered in place. Ordering of the
  *   survivors is whatever vectorMultiErase leaves behind
  */
 void filterPeakNeedBiggerObs(const AbundanceData &massTable,
                 const vector<float > &peakPosition, float tolerance,
                 size_t solutionCharge,vector<vector<ISOTOPE_ENTRY> > &solutions)
 {
     //We need to scan the solutions, look at each peak and work out
     // if there are any larger peaks in the list of possible solutions
     vector<bool> cullSolutions(solutions.size(),false);

     for(size_t ui=0;ui<solutions.size();ui++)
     {
         //Compute the mass distribution from the element mass chain
         //==
         //Convert the list of masses into a [ element, freq] listing
         vector<size_t> solutionElements;
         vector<size_t> solutionFreq;
         buildFrequencyTable(solutions[ui],
             solutionElements,solutionFreq);

         //Compute theoretical peak intensity distribution,
         // then convert from mass to mass-to-charge
         vector<pair<float,float> > massDist;
         massTable.generateIsotopeDist(solutionElements,
                 solutionFreq,massDist);

         for(size_t uj=0;uj<massDist.size();uj++)
             massDist[uj].first/=solutionCharge;
         //==

         //a) Find our solution's own position. This has to be expressed as
         // mass-to-charge as well, otherwise it can never line up with the
         // (already charge-corrected) distribution when the charge is not 1
         float solutionMass=0;
         for(size_t uj=0;uj<solutions[ui].size();uj++)
             solutionMass+=solutions[ui][uj].mass;
         solutionMass/=solutionCharge;

         //b) Find the natural amplitude of the solution's own peak
         float solutionNaturalAmp=-1;
         for(size_t uj=0; uj<massDist.size();uj++)
         {
             if(fabs(massDist[uj].first - solutionMass) < sqrt(std::numeric_limits<float>::epsilon()))
             {
                 solutionNaturalAmp=massDist[uj].second;
                 break;
             }
         }

         ASSERT(solutionNaturalAmp>=0.0f);

         //Now check to make sure that *all* peaks above our
         // solution size exist, marking solutions to be culled
         // if not all larger peaks can be found in the input
         for(size_t uj=0;uj<massDist.size();uj++)
         {
             //Peaks that are no larger than the solution's own peak
             // are not required to be present in the observation
             if(massDist[uj].second <= solutionNaturalAmp)
                 continue;

             bool found=false;
             for(size_t uk=0; uk<peakPosition.size();uk++)
             {
                 if(fabs(peakPosition[uk]-massDist[uj].first) < tolerance)
                 {
                     found=true;
                     break;
                 }
             }

             //If a larger peak could not be found
             // in the observed distribution,
             // then this solution is invalid
             if(!found)
             {
                 cullSolutions[ui]=true;
                 break;
             }
         }
     }

     //Cull the solution vector
     vectorMultiErase(solutions,cullSolutions);
 }

 //!Estimate, per solution, the largest fraction of the target peak that the solution can explain
 /*! For each solution, the grouped theoretical isotope distribution is generated,
  *  charge-corrected, and rescaled so that the theoretical peak closest to peakMass
  *  matches the counts observed there. The distribution is then scaled down by the
  *  smallest observed/theoretical ratio over all peaks that have enough counts to be
  *  considered stable. The result is the scaled theoretical count at the target peak
  *  divided by the observed count at that peak.
  *  \param intensityData observed (position, intensity) pairs
  *  \param peakMass position of the peak being identified
  *  \param massWidth half-width of the window used to count observed events around each theoretical peak
  *  \param solutions candidate isotope combinations
  *  \param massTable abundance table used to generate the isotope distributions
  *  \param massDistTol tolerance passed to generateGroupedIsotopeDist
  *  \param solutionCharge charge state; theoretical masses are divided by this
  *  \return one value per solution; -1.0f where no fraction could be computed
  */
 vector<float> maxExplainedFraction(const vector<pair<float,float> > &intensityData, float peakMass, float massWidth,
     const vector<vector<ISOTOPE_ENTRY> > &solutions, const AbundanceData &massTable,
     float massDistTol, unsigned int solutionCharge)
 {
     const unsigned int NO_PEAK=std::numeric_limits<unsigned int>::max();
     const float NO_RATIO=std::numeric_limits<float>::max();

     //Peaks where both the observed and the theoretical counts are below
     // these thresholds are skipped when finding the limiting ratio
     const unsigned int OBSERVED_COUNT_THRESHOLD = 10;
     const unsigned int MASS_DIST_MIN_THRESHOLD= 10;

     vector<pair<float,float> > massDist;

     vector<float> explainedFractions;
     explainedFractions.resize(solutions.size());

     for(size_t ui=0;ui<solutions.size();ui++)
     {
         //Convert the list of masses into a [ element, freq] listing
         vector<size_t> solutionElements;
         vector<size_t> solutionFreq;
         buildFrequencyTable(solutions[ui],
             solutionElements,solutionFreq);

         //Compute theoretical peak intensity distribution.
         // use the grouped version, to clump similar massed
         // isotopes together.
         massTable.generateGroupedIsotopeDist(solutionElements,
                 solutionFreq,massDist,massDistTol);

         //Correct for solution charge.
         for(unsigned int uj=0;uj<massDist.size();uj++)
             massDist[uj].first/=solutionCharge;

         //For each theoretical peak, count the observed events around it, and
         // at the same time locate the theoretical peak nearest to our target.
         //FIXME: This should not simply snap the closest.
         unsigned int closestPeak=NO_PEAK;
         float massErr=std::numeric_limits<float>::max();

         vector<float> observedCounts(massDist.size());
         for(unsigned int uj=0;uj<massDist.size();uj++)
         {
             observedCounts[uj] =countIntensityEvents(intensityData,massDist[uj].first-massWidth, massDist[uj].first+massWidth);

             const float massErrTmp = fabs(massDist[uj].first - peakMass);
             if(massErrTmp < massErr)
             {
                 massErr = massErrTmp;
                 closestPeak=uj;
             }

             //FIXME: Add some extra counts in order to accommodate our confidence level
             //observedCounts[uj] += confidencePoisson(observedCounts[uj],alpha);
         }

         //Should have found a peak.
         ASSERT(closestPeak !=NO_PEAK);

         //When assertions are compiled out, an empty (or non-finite) distribution
         // would otherwise be indexed with NO_PEAK below. Report "not computable"
         if(closestPeak == NO_PEAK)
         {
             explainedFractions[ui]=-1.0f;
             continue;
         }

         //rescale our theoretical distribution to the real one
         const float scaleFactor=observedCounts[closestPeak]/massDist[closestPeak].second;

         vector<float> scaledSolution(massDist.size());
         for(unsigned int uj=0;uj<massDist.size();uj++)
             scaledSolution[uj] = massDist[uj].second*scaleFactor;

         //Find the limiting peak: the one with the smallest ratio of observed
         // counts to our rescaled isotopic distribution
         //=================
         float minTheoreticalRatio=NO_RATIO;
         for(unsigned int uj=0;uj<scaledSolution.size();uj++)
         {
             //If the ratio is likely to be unstable, then don't compute this
             if(observedCounts[uj] < OBSERVED_COUNT_THRESHOLD &&
                 scaledSolution[uj] < MASS_DIST_MIN_THRESHOLD)
                 continue;

             //Theoretical ratio is the ratio of observed to theoretical
             // counts
             const float theoreticalRatio=observedCounts[uj]/scaledSolution[uj];
             minTheoreticalRatio=std::min(minTheoreticalRatio,theoreticalRatio);
         }

         //No peak was stable enough to provide a ratio
         if(minTheoreticalRatio == NO_RATIO)
         {
             explainedFractions[ui]=-1.0f;
             continue;
         }

         //Scale the target peak down by the limiting ratio, and compare it
         // with what was observed there. The target is the same closest peak
         // that was located above, so there is no need to search for it again
         const float limitedCounts=scaledSolution[closestPeak]*minTheoreticalRatio;
         explainedFractions[ui]=limitedCounts/observedCounts[closestPeak];
         ASSERT(explainedFractions[ui] >=0.0f && explainedFractions[ui] <=1.1f);
         //=================
     }
     return explainedFractions;

 }


 #ifdef DEBUG
 #include <iostream>
 #include <cstdlib>

 using std::cerr;
 using std::endl;

 bool testMaxExplainedFraction()
 {
     AbundanceData massTable;
     if(massTable.open("naturalAbundance.xml"))
     {
         cerr << "WARN : Error opening abundance table, skipping" << endl;
         return true;
     }

     //Simple test. A single ion should be 100% explained by the
     // matching isotope
     {
     vector<pair<float,float> > massData;

     ISOTOPE_ENTRY mnWeight;

     unsigned int elemIdx,isotopeIdx;
     elemIdx= massTable.symbolIndex("Mn");
     isotopeIdx=massTable.getMajorIsotopeFromElemIdx(elemIdx);
     mnWeight= massTable.isotope(elemIdx,isotopeIdx);


     //Create some simulated mass data
     const unsigned int NUM_PTS =100;
     massData.resize(1);
     massData[0].first=mnWeight.mass;
     massData[0].second=NUM_PTS;

     //OK, so we've set the weight. Lets create a "solution" to the mass problem.
     // this is simply a vector of weights comprising the species that make up
     // the combined result
     vector<ISOTOPE_ENTRY> soln;
     soln.push_back(mnWeight);

     //Package this up for the function
     vector<vector<ISOTOPE_ENTRY> > solutions;
     solutions.push_back(soln);

     vector<float> explainedFractions;
     explainedFractions = maxExplainedFraction(massData, mnWeight.mass , 0.15, solutions,massTable,0.01,1);

     ASSERT(explainedFractions.size() ==1);

     float delta = fabs(explainedFractions[0] -1.0f);

     //FIXME: This is a little lax?
     TEST(delta <0.05,"Mn isotope test");
     }

     {
         //More complex test case. Lets have a Ga overlap with some
         // unknown species, such that the observed ratios are 1:1
         // In this case, the first peak should be fully explained by Ga,
         // but the second peak should not be

         vector<pair<float,float> > massData;

         ISOTOPE_ENTRY gaWeight[2];

         unsigned int gaIdx;

         gaIdx= massTable.symbolIndex("Ga");

         gaWeight[0]=massTable.isotope(gaIdx,0);
         gaWeight[1]=massTable.isotope(gaIdx,1);

         //Create some simulated mass data, in a 1:1 ratio
         const unsigned int NUM_PTS =6010;
         massData.resize(2);
         massData[0].first = gaWeight[0].mass;
         massData[1].first = gaWeight[1].mass;
         massData[0].second = NUM_PTS;
         massData[1].second = NUM_PTS;


         vector<ISOTOPE_ENTRY> soln;
         soln.push_back(gaWeight[0]);

         //Package this up for the function
         vector<vector<ISOTOPE_ENTRY> > solutions;
         solutions.push_back(soln);

         {
         vector<float> explainedFractions;
         explainedFractions = maxExplainedFraction(massData, gaWeight[0].mass ,
                                 0.15, solutions,massTable,0.01, 1);

         ASSERT(explainedFractions.size() ==1);

         float delta = fabs(explainedFractions[0] -1.0f);

         //FIXME: This is a little lax?
         TEST(delta <0.05,"Ga large isotope test");
         }

         //Repeat for other peak
         {
         vector<float> explainedFractions;
         explainedFractions = maxExplainedFraction(massData, gaWeight[1].mass ,
                                 0.15, solutions,massTable,0.01,1);

         ASSERT(explainedFractions.size() ==1);

         //Should be ~70% explained (due to isotopic abundance - Ga only can make up 70.925% of the observed peak count)
         float delta = fabs(explainedFractions[0] -0.70925f);

         //FIXME: This is a little lax?
         TEST(delta <0.05,"Ga small isotope test");
         }
     }

     {
         //More complex test case again.
         // Overlap Cl with AlB at the 1+ charge state in ratio 1:1.
         // Check the 37 overlap

         vector<pair<float,float> > massData;

         ISOTOPE_ENTRY clWeight[2], alWeight, bWeight;

         unsigned int elemIdx;
         elemIdx=massTable.symbolIndex("Cl");
         clWeight[0]=massTable.isotope(elemIdx,0);
         clWeight[1]=massTable.isotope(elemIdx,1);

         //Obtain the weights for 27^Al and 10^B
         // combined these have mass ~37 amu
         elemIdx=massTable.symbolIndex("Al");
         alWeight=massTable.isotope(elemIdx,0);

         elemIdx=massTable.symbolIndex("B");
         bWeight=massTable.isotope(elemIdx,0);

         //Create some simulated mass data, in a 1:1 ratio
         // Natural abundances for Cl : 35^Cl -> 75.77%, 37^Cl -> 24.23%
         const unsigned int NUM_35_CLPTS =7577;
         const unsigned int NUM_37_CLPTS =2423;
         massData.resize(2);
         massData[0].first = clWeight[0].mass;
         massData[1].first = clWeight[1].mass;
         massData[0].second = NUM_35_CLPTS;
         massData[1].second = NUM_37_CLPTS;

         //So we are going to ask the question,
         // what is the explain fraction of the peak at 37 being Cl, rather than AlB
         // Let us thus propose solutions for the 37 peak

         vector<ISOTOPE_ENTRY> solnCl, solnAlB;
         solnCl.push_back(clWeight[1]);

         solnAlB.push_back(alWeight);
         solnAlB.push_back(bWeight);

         //Package this up for the function
         vector<vector<ISOTOPE_ENTRY> > solutions;
         solutions.push_back(solnCl);
         solutions.push_back(solnAlB);

         {
         //Compute the explained fraction of the two solutions
         vector<float> explainedFractions;
         float meanMass = 0.5*(clWeight[1].mass + (alWeight.mass + bWeight.mass));
         explainedFractions = maxExplainedFraction(massData, meanMass,
                             0.15, solutions,massTable,0.01,1);

         ASSERT(explainedFractions.size() ==2);


         //FIXME: This is a little lax?
         TEST(explainedFractions[0] > 0.95,"Cl fully explains 37 peak");
         TEST(explainedFractions[1] < 0.05,"AlB does not explain 37 peak");
         }
     }
     return true;
 }

 //!Run the unit tests for the isotope filtering routines
 /*! \return the result of testMaxExplainedFraction, currently the only test
  */
 bool isotopeFilterTests()
 {
     const bool explainedFractionOk=testMaxExplainedFraction();
     return explainedFractionOk;
 }

 #endif

 }
AtomProbe::AbundanceData::symbolIndex
size_t symbolIndex(const char *symbol, bool caseSensitive=true) const
Return the element's position in table, starting from 0.
Definition: abundance.cpp:214

AtomProbe::buildFrequencyTable
void buildFrequencyTable(const vector< ISOTOPE_ENTRY > &solutionVec, vector< size_t > &solutionElements, vector< size_t > &solutionFrequency)
Definition: filter.cpp:50

misc.h

filter.h

AtomProbe::ISOTOPE_ENTRY
Definition: abundance.h:42

AtomProbe::AbundanceData::generateIsotopeDist
void generateIsotopeDist(const std::vector< size_t > &elementIdx, const std::vector< size_t > &frequency, std::vector< std::pair< float, float > > &massDist) const
Compute the mass-probability distribution for the grouped set of ions.
Definition: abundance.cpp:277

AtomProbe::vectorMultiErase
void vectorMultiErase(std::vector< T > &vec, const std::vector< bool > &wantKill)
Remove elements from the vector, without preserving order.
Definition: misc.h:112

AtomProbe::filterPeakNeedBiggerObs
void filterPeakNeedBiggerObs(const AtomProbe::AbundanceData &massTable, const std::vector< float > &peakData, float tolerance, size_t solutionCharge, std::vector< std::vector< AtomProbe::ISOTOPE_ENTRY > > &solutions)
Definition: filter.cpp:119

TEST
#define TEST(f, g)
Definition: aptAssert.h:49

AtomProbe::AbundanceData::generateGroupedIsotopeDist
void generateGroupedIsotopeDist(const std::vector< size_t > &elementIdx, const std::vector< size_t > &frequency, std::vector< std::pair< float, float > > &massDist, float massTolerance) const
As per generateIsotopeDist, however, this convenience groups the distribution to limit the effect of ...
Definition: abundance.cpp:395

AtomProbe::filterBySolutionPPM
void filterBySolutionPPM(const AtomProbe::AbundanceData &massTable, float minPpm, std::vector< std::vector< AtomProbe::ISOTOPE_ENTRY > > &solutions)
Use the maximum possible PPM for each isotopic combination to filter possible solutions.
Definition: filter.cpp:83

AtomProbe
Definition: axialdf.h:24

AtomProbe::AbundanceData::open
size_t open(const char *file, bool strict=false)
Attempt to open the abundance data file, return 0 on success, nonzero on failure. ...
Definition: abundance.cpp:79

AtomProbe::ISOTOPE_ENTRY::abundance
float abundance
Definition: abundance.h:49

AtomProbe::AbundanceData
Class to load abundance information for natural isotopes.
Definition: abundance.h:54

AtomProbe::maxExplainedFraction
std::vector< float > maxExplainedFraction(const std::vector< std::pair< float, float > > &massData, float peakMass, float massWidth, const std::vector< std::vector< AtomProbe::ISOTOPE_ENTRY > > &solutions, const AtomProbe::AbundanceData &massTable, float massDistTol, unsigned int solutionCharge)
Compute the fraction of the data that has been explained, using the natural abundance information...

aptAssert.h

AtomProbe::AbundanceData::getMajorIsotopeFromElemIdx
size_t getMajorIsotopeFromElemIdx(size_t elementIdx) const
Obtain the most prominent isotope's index from the element index.
Definition: abundance.cpp:482

ASSERT
#define ASSERT(f)
Definition: zechBackground.cpp:26

AtomProbe::ISOTOPE_ENTRY::mass
float mass
Definition: abundance.h:47

AtomProbe::AbundanceData::isotope
const ISOTOPE_ENTRY & isotope(size_t elementIdx, size_t isotopeIdx) const
Obtain a reference to a particular isotope, using the element's index in the table, and the isotope index.
Definition: abundance.cpp:271

AtomProbe::countIntensityEvents
unsigned int countIntensityEvents(const vector< pair< float, float > > &data, float minV, float maxV)
Definition: filter.cpp:36