OpenMS  2.4.0
SimpleSVM.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Hendrik Weisser $
32 // $Authors: Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
38 
39 #include <svm.h>
40 
41 #include <map>
42 #include <vector>
43 #include <utility> // for "pair"
44 
45 namespace OpenMS
46 {
// Simple interface to support vector machines for classification (via LIBSVM).
// Derives from DefaultParamHandler, the base class for classes handling
// default parameters.
//
// NOTE(review): this is a rendered source listing; the leading numbers are the
// line numbers of the original header, and gaps in the numbering mark lines
// (mostly documentation comments) that the listing does not show.
65  class OPENMS_DLLAPI SimpleSVM :
66  public DefaultParamHandler
67  {
68 
69  public:
// Mapping from predictor name to vector of predictor values.
71  typedef std::map<String, std::vector<double> > PredictorMap;
72 
// SVM prediction result.
74  struct Prediction
75  {
// NOTE(review): the cross-reference index lists a member "Int label"
// (predicted class label) defined at header line 77; its declaration is
// missing from this listing.
78 
// Predicted probabilities for different classes.
80  std::map<Int, double> probabilities;
81  };
82 
// Default constructor.
84  SimpleSVM();
85 
// Destructor (overrides the base class destructor).
87  ~SimpleSVM() override;
88 
// Takes the predictors (by non-const reference) and a map of labels keyed by
// a Size value.
// NOTE(review): presumably loads the data and trains the model, with the keys
// of "labels" being observation indexes into the predictor vectors, and may
// modify "predictors" (e.g. via scaleData_) - confirm against the
// implementation.
99  void setup(PredictorMap& predictors, const std::map<Size, Int>& labels);
100 
// Writes results into "predictions"; "indexes" is taken by value and defaults
// to an empty vector.
// NOTE(review): presumably an empty "indexes" means "predict for all
// observations" - TODO confirm.
110  void predict(std::vector<Prediction>& predictions,
111  std::vector<Size> indexes = std::vector<Size>()) const;
112 
// Output parameter "feature_weights" maps a String key to a weight.
// NOTE(review): keys are presumably predictor names - confirm.
121  void getFeatureWeights(std::map<String, double>& feature_weights) const;
122 
// NOTE(review): judging by the name, writes the cross-validation results
// (see performance_) to the file at "path" - confirm output format in the
// implementation.
124  void writeXvalResults(const String& path) const;
125 
126  protected:
// Classification performance for different param. combinations (C/gamma).
128  typedef std::vector<std::vector<double> > SVMPerformance;
129 
// Values of predictors (LIBSVM format).
131  std::vector<std::vector<struct svm_node> > nodes_;
132 
// NOTE(review): svm_problem is not declared in this file (presumably from
// <svm.h>); likely the training data handed to LIBSVM - confirm.
134  struct svm_problem data_;
135 
// NOTE(review): svm_parameter is not declared in this file (presumably from
// <svm.h>); likely the LIBSVM settings used for training - confirm.
137  struct svm_parameter svm_params_;
138 
// Pointer to SVM model (LIBSVM format).
// NOTE(review): a raw pointer; ownership/cleanup is not visible here
// (presumably handled in the destructor) - confirm.
140  struct svm_model* model_;
141 
// Names of predictors in the model (excluding uninformative ones).
143  std::vector<String> predictor_names_;
144 
// NOTE(review): the cross-reference index lists a member "Size n_parts_"
// (number of partitions for cross-validation) defined at header line 146; its
// declaration is missing from this listing.
147 
// NOTE(review): presumably the log2 values of the C and gamma parameters
// tried during parameter optimization - confirm.
149  std::vector<double> log2_C_, log2_gamma_;
150 
// NOTE(review): the cross-reference index lists a member
// "SVMPerformance performance_" (cross-validation results) defined at header
// line 152; its declaration is missing from this listing.
153 
// Dummy function to suppress LIBSVM output (ignores its argument, does nothing).
155  static void printNull_(const char*) {}
156 
// NOTE(review): presumably rescales the predictor values in place (the map is
// taken by non-const reference although the method itself is const) - confirm.
158  void scaleData_(PredictorMap& predictors) const;
159 
// NOTE(review): presumably converts the predictors to LIBSVM format (filling
// nodes_) - confirm.
161  void convertData_(const PredictorMap& predictors);
162 
// Returns a pair of doubles.
// NOTE(review): presumably the best (C, gamma) combination according to
// the cross-validation performance - confirm order and scale (log2 or not).
164  std::pair<double, double> chooseBestParameters_() const;
165 
// NOTE(review): presumably runs the cross-validation over the C/gamma
// candidates and stores the chosen parameters - confirm.
167  void optimizeParameters_();
168  };
169 }
170 
OpenMS::SimpleSVM::SVMPerformance
std::vector< std::vector< double > > SVMPerformance
Classification performance for different param. combinations (C/gamma):
Definition: SimpleSVM.h:128
DefaultParamHandler.h
OpenMS::ProteinIdentification::SearchParameters::digestion_enzyme
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:118
OpenMS::ProgressLogger::setProgress
void setProgress(SignedSize value) const
Sets the current progress.
OpenMS::TOPPBase
Base class for TOPP applications.
Definition: TOPPBase.h:150
OpenMS::Normalizer
Normalizes the peak intensities spectrum-wise.
Definition: Normalizer.h:57
OpenMS::PeptideIndexing::PEPTIDE_IDS_EMPTY
Definition: PeptideIndexing.h:133
OpenMS::ProteaseDigestion::setEnzyme
void setEnzyme(const String &name)
Sets the enzyme for the digestion (by name)
OpenMS::SimpleSVM::performance_
SVMPerformance performance_
Cross-validation results.
Definition: SimpleSVM.h:152
OpenMS::SimpleSVM::Prediction::probabilities
std::map< Int, double > probabilities
Predicted probabilities for different classes.
Definition: SimpleSVM.h:80
OpenMS::DigestionEnzymeDB< DigestionEnzymeProtein, ProteaseDB >::getInstance
static ProteaseDB * getInstance()
this member function serves as a replacement of the constructor
Definition: DigestionEnzymeDB.h:70
Int
OpenMS::ProteinIdentification::PeakMassType
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:93
OpenMS::MSExperiment::sortSpectra
void sortSpectra(bool sort_mz=true)
Sorts the data points by retention time.
OpenMS::FASTAFile
This class serves for reading in and writing FASTA files.
Definition: FASTAFile.h:64
OpenMS::WindowMower
WindowMower augments the highest peaks in a sliding or jumping window.
Definition: WindowMower.h:54
OpenMS::SimpleSVM
Simple interface to support vector machines for classification (via LIBSVM).
Definition: SimpleSVM.h:65
NUMBER_OF_THREADS
#define NUMBER_OF_THREADS
Definition: SimpleSearchEngine.cpp:74
OpenMS::Param::setValue
void setValue(const String &key, const DataValue &value, const String &description="", const StringList &tags=StringList())
Sets a value.
OpenMS::ProteinIdentification::SearchParameters::missed_cleavages
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:113
OpenMS::MzMLFile
File adapter for MzML files.
Definition: MzMLFile.h:55
OpenMS::String
A more convenient string class.
Definition: String.h:57
OpenMS::MSExperiment::begin
Iterator begin()
Definition: MSExperiment.h:157
OpenMS::ModificationsDB::getInstance
static ModificationsDB * getInstance(OpenMS::String unimod_file="CHEMISTRY/unimod.xml", OpenMS::String psimod_file="CHEMISTRY/PSI-MOD.obo", OpenMS::String xlmod_file="CHEMISTRY/XLMOD.obo")
Returns a pointer to the modifications DB (singleton)
Definition: ModificationsDB.h:77
Peak1D.h
OpenMS::WindowMower::filterPeakSpectrum
void filterPeakSpectrum(PeakSpectrum &spectrum)
MzMLFile.h
OpenMS::PeptideIdentification::setRT
void setRT(double rt)
sets the RT of the MS2 spectrum where the identification occurred
OpenMS::MSExperiment
In-Memory representation of a mass spectrometry experiment.
Definition: MSExperiment.h:77
SimpleSearchEngine
Definition: SimpleSearchEngine.cpp:81
OpenMS::DigestionEnzymeDB::getAllNames
void getAllNames(std::vector< String > &all_names) const
returns all the enzyme names (does NOT include synonym names)
Definition: DigestionEnzymeDB.h:122
OpenMS::PeakFileOptions::addMSLevel
void addMSLevel(int level)
adds a desired MS level for peaks to load
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
OpenMS::AASequence::getMonoWeight
double getMonoWeight(Residue::ResidueType type=Residue::Full, Int charge=0) const
IdXMLFile.h
SimpleSearchEngine::AnnotatedHit::peptide_mod_index
SignedSize peptide_mod_index
Definition: SimpleSearchEngine.cpp:88
OpenMS::Constants::c
const double c
HyperScore.h
LOG_INFO
#define LOG_INFO
Macro to use if information, e.g. a status, should be reported.
Definition: LogStream.h:454
OpenMS::ProteinIdentification::MONOISOTOPIC
Definition: ProteinIdentification.h:95
OpenMS::ProgressLogger::startProgress
void startProgress(SignedSize begin, SignedSize end, const String &label) const
Initializes the progress display.
OpenMS::PeptideIndexing::run
ExitCodes run(std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
forward for old interface and pyOpenMS; use run<T>() for more control
Definition: PeptideIndexing.h:147
OpenMS::TheoreticalSpectrumGenerator::getSpectrum
virtual void getSpectrum(PeakSpectrum &spec, const AASequence &peptide, Int min_charge, Int max_charge) const
returns a spectrum with the ion types, that are set in the tool parameters
OpenMS::IntList
std::vector< Int > IntList
Vector of signed integers.
Definition: ListUtils.h:58
OpenMS::SimpleSVM::Prediction::label
Int label
Predicted class label.
Definition: SimpleSVM.h:77
SpectrumSettings.h
OpenMS::Constants::PROTON_MASS_U
const double PROTON_MASS_U
OpenMS::ProgressLogger::endProgress
void endProgress() const
Ends the progress display.
Constants.h
OpenMS::PeptideIndexing::UNEXPECTED_RESULT
Definition: PeptideIndexing.h:135
OpenMS::EnzymaticDigestion::setMissedCleavages
void setMissedCleavages(Size missed_cleavages)
Sets the number of missed cleavages for the digestion (default is 0). This setting is ignored when lo...
OpenMS::MSExperiment::size
Size size() const
Definition: MSExperiment.h:127
OpenMS::NLargest
NLargest removes all but the n largest peaks.
Definition: NLargest.h:54
OpenMS::PeptideIdentification::setHits
void setHits(const std::vector< PeptideHit > &hits)
Sets the peptide hits.
SimpleSearchEngine::AnnotatedHit::score
double score
Definition: SimpleSearchEngine.cpp:89
OpenMS::DefaultParamHandler
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:91
OpenMS::ProteinIdentification::SearchParameters::fragment_mass_tolerance_ppm
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:115
OpenMS::DateTime::now
static DateTime now()
Returns the current date and time.
WindowMower.h
OpenMS::ProteinIdentification::SearchParameters::charges
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:109
SimpleSearchEngine::AnnotatedHit::sequence
StringView sequence
Definition: SimpleSearchEngine.cpp:87
OpenMS::Int
int Int
Signed integer type.
Definition: Types.h:102
OpenMS::DigestionEnzymeDB::getEnzyme
const DigestionEnzymeType * getEnzyme(const String &name) const
Definition: DigestionEnzymeDB.h:99
OpenMS::PeptideIdentification::assignRanks
void assignRanks()
Sorts the hits by score and assigns ranks according to the scores.
OpenMS::SimpleSVM::log2_gamma_
std::vector< double > log2_gamma_
Definition: SimpleSVM.h:149
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
OpenMS::SimpleSVM::n_parts_
Size n_parts_
Number of partitions for cross-validation.
Definition: SimpleSVM.h:146
SimpleSearchEngine::registerOptionsAndFlags_
void registerOptionsAndFlags_() override
Sets the valid command line options (with argument) and flags (without argument).
Definition: SimpleSearchEngine.cpp:107
OpenMS::Normalizer::filterPeakMap
void filterPeakMap(PeakMap &exp) const
OpenMS::ProteinIdentification::SearchParameters::variable_modifications
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:112
OpenMS::ProteinIdentification::SearchParameters::mass_type
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:110
OpenMS::ProgressLogger
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:54
OpenMS::SimpleSVM::nodes_
std::vector< std::vector< struct svm_node > > nodes_
Values of predictors (LIBSVM format)
Definition: SimpleSVM.h:131
OpenMS::MetaInfoInterface::setMetaValue
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
OpenMS::MzMLFile::load
void load(const String &filename, PeakMap &map)
Loads a map from a MzML file. Spectra and chromatograms are sorted by default (this can be disabled u...
OpenMS::ModifiedPeptideGenerator::applyVariableModifications
static void applyVariableModifications(const std::vector< ResidueModification >::const_iterator &var_mods_begin, const std::vector< ResidueModification >::const_iterator &var_mods_end, const AASequence &peptide, Size max_variable_mods_per_peptide, std::vector< AASequence > &all_modified_peptides, bool keep_original=true)
double
FASTAFile.h
OpenMS::ProteinIdentification::SearchParameters::db
String db
The used database.
Definition: ProteinIdentification.h:106
OpenMS::StringView
StringView provides a non-owning view on an existing string.
Definition: String.h:480
Param.h
OpenMS::PeptideIdentification::setHigherScoreBetter
void setHigherScoreBetter(bool value)
sets the peptide score orientation
OpenMS::DefaultParamHandler::setParameters
void setParameters(const Param &param)
Sets the parameters.
SimpleSearchEngine::getModifications_
vector< ResidueModification > getModifications_(StringList modNames)
Definition: SimpleSearchEngine.cpp:169
OpenMS::ModifiedPeptideGenerator::applyFixedModifications
static void applyFixedModifications(const std::vector< ResidueModification >::const_iterator &fixed_mods_begin, const std::vector< ResidueModification >::const_iterator &fixed_mods_end, AASequence &peptide)
Normalizer.h
OpenMS::PeptideHit::setCharge
void setCharge(Int charge)
sets the charge of the peptide
OpenMS::VersionInfo::getVersion
static String getVersion()
Return the version number of OpenMS.
OpenMS::DefaultParamHandler::getParameters
const Param & getParameters() const
Non-mutable access to the parameters.
SimpleSearchEngine::AnnotatedHit
Slimmer structure as storing all scored candidates in PeptideHit objects takes too much space.
Definition: SimpleSearchEngine.cpp:85
OpenMS::ProteinIdentification::SearchParameters::fragment_mass_tolerance
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:114
OpenMS::Deisotoper::deisotopeAndSingleCharge
static void deisotopeAndSingleCharge(MSSpectrum &spectra, double fragment_tolerance, bool fragment_unit_ppm, int min_charge=1, int max_charge=3, bool keep_only_deisotoped=false, unsigned int min_isopeaks=3, unsigned int max_isopeaks=10, bool make_single_charged=true, bool annotate_charge=false)
OpenMS::PeptideIndexing::ExitCodes
ExitCodes
Exit codes.
Definition: PeptideIndexing.h:129
OpenMS::StringList
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:73
OpenMS::MSSpectrum::sortByPosition
void sortByPosition()
Lexicographically sorts the peaks by their position.
OpenMS::PeptideHit::setSequence
void setSequence(const AASequence &sequence)
sets the peptide sequence
OpenMS::ProteinIdentification::SearchParameters::fixed_modifications
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:111
SimpleSearchEngine::AnnotatedHit::hasBetterScore
static bool hasBetterScore(const AnnotatedHit &a, const AnnotatedHit &b)
Definition: SimpleSearchEngine.cpp:92
ModificationsDB.h
NLargest.h
MSExperiment.h
OpenMS::ProteinIdentification::SearchParameters::precursor_mass_tolerance
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:116
OpenMS::IdXMLFile::store
void store(String filename, const std::vector< ProteinIdentification > &protein_ids, const std::vector< PeptideIdentification > &peptide_ids, const String &document_id="")
Stores the data in an idXML file.
OpenMS::SignedSize
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
OpenMS::TheoreticalSpectrumGenerator
Generates theoretical spectra with various options.
Definition: TheoreticalSpectrumGenerator.h:63
OpenMS::PeptideIdentification
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:62
RNPxlDeisotoper.h
OpenMS::EnzymaticDigestion::digestUnmodified
Size digestUnmodified(const StringView &sequence, std::vector< StringView > &output, Size min_length=1, Size max_length=0) const
Performs the enzymatic digestion of an unmodified sequence.
OpenMS::SimpleSVM::printNull_
static void printNull_(const char *)
Dummy function to suppress LIBSVM output.
Definition: SimpleSVM.h:155
OpenMS::FASTAFile::load
static void load(const String &filename, std::vector< FASTAEntry > &data)
loads a FASTA file given by 'filename' and stores the information in 'data'
OpenMS::ThresholdMower::filterPeakMap
void filterPeakMap(PeakMap &exp)
OpenMS::ProteaseDigestion
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:60
OpenMS::MSExperiment::end
Iterator end()
Definition: MSExperiment.h:167
SimpleSearchEngine::postProcessHits_
void postProcessHits_(const PeakMap &exp, vector< vector< AnnotatedHit > > &annotated_hits, vector< ProteinIdentification > &protein_ids, vector< PeptideIdentification > &peptide_ids, Size top_hits, const vector< ResidueModification > &fixed_modifications, const vector< ResidueModification > &variable_modifications, Size max_variable_mods_per_peptide)
Definition: SimpleSearchEngine.cpp:231
OpenMS::PeptideIndexing::DATABASE_EMPTY
Definition: PeptideIndexing.h:132
OpenMS::PeptideIndexing
Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds ...
Definition: PeptideIndexing.h:123
OpenMS::SimpleSVM::model_
struct svm_model * model_
Pointer to SVM model (LIBSVM format)
Definition: SimpleSVM.h:140
OpenMS::PeakFileOptions
Options for loading files containing peak data.
Definition: PeakFileOptions.h:47
OpenMS::MSExperiment::getPrimaryMSRunPath
void getPrimaryMSRunPath(StringList &toFill) const
get the file path to the first MS run
OpenMS::ThresholdMower
ThresholdMower removes all peaks below a threshold.
Definition: ThresholdMower.h:51
SimpleSearchEngine::main_
ExitCodes main_(int, const char **) override
The actual "main" method. main_() is invoked by main().
Definition: SimpleSearchEngine.cpp:325
OpenMS::TOPPBase::ExitCodes
ExitCodes
Exit codes.
Definition: TOPPBase.h:155
PeptideIndexing.h
SimpleSearchEngine::AnnotatedHit::fragment_annotations
std::vector< PeptideHit::PeakAnnotation > fragment_annotations
Definition: SimpleSearchEngine.cpp:90
OpenMS::ModificationsDB::getAllSearchModifications
void getAllSearchModifications(std::vector< String > &modifications) const
Collects all modifications that can be used for identification searches.
OpenMS::PeakFileOptions::clearMSLevels
void clearMSLevels()
clears the MS levels
SimpleSearchEngine::SimpleSearchEngine
SimpleSearchEngine()
Definition: SimpleSearchEngine.cpp:99
OpenMS::ProteinIdentification::SearchParameters::precursor_mass_tolerance_ppm
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:117
OpenMS::Param
Management and storage of parameters / INI files.
Definition: Param.h:74
OpenMS::SimpleSVM::PredictorMap
std::map< String, std::vector< double > > PredictorMap
Mapping from predictor name to vector of predictor values.
Definition: SimpleSVM.h:71
OpenMS::SimpleSVM::Prediction
SVM prediction result.
Definition: SimpleSVM.h:74
OpenMS::AASequence
Representation of a peptide/protein sequence.
Definition: AASequence.h:107
IDFilter.h
ModifiedPeptideGenerator.h
OpenMS::MSExperiment::ConstIterator
std::vector< SpectrumType >::const_iterator ConstIterator
Non-mutable iterator.
Definition: MSExperiment.h:113
ResidueModification.h
OpenMS::AASequence::fromString
static AASequence fromString(const String &s, bool permissive=true)
create AASequence object by parsing an OpenMS string
TheoreticalSpectrumGenerator.h
OpenMS::SimpleSVM::predictor_names_
std::vector< String > predictor_names_
Names of predictors in the model (excluding uninformative ones)
Definition: SimpleSVM.h:143
OpenMS::Constants::C13C12_MASSDIFF_U
const double C13C12_MASSDIFF_U
OpenMS::PeptideIndexing::EXECUTION_OK
Definition: PeptideIndexing.h:131
OpenMS::MSSpectrum
The representation of a 1D spectrum.
Definition: MSSpectrum.h:66
OpenMS::ProteinIdentification::SearchParameters
Search parameters of the DB search.
Definition: ProteinIdentification.h:103
OpenMS::PeptideIdentification::setMZ
void setMZ(double mz)
sets the MZ of the MS2 spectrum
SimpleSearchEngine::preprocessSpectra_
void preprocessSpectra_(PeakMap &exp, double fragment_mass_tolerance, bool fragment_mass_tolerance_unit_ppm)
Definition: SimpleSearchEngine.cpp:183
OpenMS::TOPPBase::main
ExitCodes main(int argc, const char **argv)
Main routine of all TOPP applications.
StandardTypes.h
OpenMS::NLargest::filterPeakSpectrum
void filterPeakSpectrum(PeakSpectrum &spectrum)
main
int main(int argc, const char **argv)
Definition: SimpleSearchEngine.cpp:655
OpenMS::HyperScore::compute
static double compute(double fragment_mass_tolerance, bool fragment_mass_tolerance_unit_ppm, const PeakSpectrum &exp_spectrum, const PeakSpectrum &theo_spectrum)
OpenMS::MzMLFile::getOptions
PeakFileOptions & getOptions()
Mutable access to the options for loading/storing.
OpenMS::ProgressLogger::setLogType
void setLogType(LogType type) const
Sets the progress log that should be used. The default type is NONE!
TOPPBase.h
OpenMS::PeptideIdentification::setScoreType
void setScoreType(const String &type)
sets the peptide score type
MSSpectrum.h
OpenMS::IdXMLFile
Used to load and store idXML files.
Definition: IdXMLFile.h:63
ThresholdMower.h
OpenMS::PeptideHit::setScore
void setScore(double score)
sets the PSM score
OpenMS::PeptideHit
Representation of a peptide hit.
Definition: PeptideHit.h:54