ProteoWizard
Digestion.hpp
Go to the documentation of this file.
1 //
2 // $Id: Digestion.hpp 3979 2012-09-28 21:49:17Z chambm $
3 //
4 //
5 // Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6 //
7 // Copyright 2006 Louis Warschaw Prostate Cancer Center
8 // Cedars Sinai Medical Center, Los Angeles, California 90048
9 // Copyright 2008 Vanderbilt University - Nashville, TN 37232
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // Unless required by applicable law or agreed to in writing, software
18 // distributed under the License is distributed on an "AS IS" BASIS,
19 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 // See the License for the specific language governing permissions and
21 // limitations under the License.
22 //
23 
24 
25 #ifndef _DIGESTION_HPP_
26 #define _DIGESTION_HPP_
27 
28 
30 #include "pwiz/data/common/cv.hpp"
32 #include "Peptide.hpp"
33 #include <boost/regex.hpp>
34 #include "boost/shared_ptr.hpp"
35 #include <string>
36 #include <limits>
37 #include <set>
38 
39 
40 namespace pwiz {
41 namespace proteome {
42 
43 
44 using namespace pwiz::cv;
45 
46 
47 /// peptide subclass that contains extra metadata provided by digestion
    // NOTE(review): the class-head line (listing line 48) is absent from this
    // extract, so the base class and export macro cannot be confirmed from here.
49 {
50  public:
51 
    /// construct from a sequence only; no digestion metadata is passed in, so
    /// the values reported by the accessors below are whatever the
    /// implementation assigns (not visible in this header)
52  DigestedPeptide(const std::string& sequence);
53  DigestedPeptide(const char* sequence);
54 
    /// construct from a character range (presumably the half-open range
    /// [begin, end) of the parent sequence - confirm in the implementation)
    /// together with explicit digestion metadata; the prefix and suffix
    /// arguments default to empty strings
55  DigestedPeptide(std::string::const_iterator begin,
56  std::string::const_iterator end,
57  size_t offset,
58  size_t missedCleavages,
59  bool NTerminusIsSpecific,
60  bool CTerminusIsSpecific,
61  std::string NTerminusPrefix = "",
62  std::string CTerminusSuffix = "");
63 
    // NOTE(review): the first line of this overload (listing line 64) is
    // missing from this extract; the remaining parameters are identical to
    // those of the iterator-range overload above.
65  size_t offset,
66  size_t missedCleavages,
67  bool NTerminusIsSpecific,
68  bool CTerminusIsSpecific,
69  std::string NTerminusPrefix = "",
70  std::string CTerminusSuffix = "");
71 
    /// copy assignment; the destructor is virtual
73  DigestedPeptide& operator=(const DigestedPeptide&);
74  virtual ~DigestedPeptide();
75 
76  /// returns the zero-based offset of the N terminus of the peptide
77  /// in the polypeptide from which it was digested
78  size_t offset() const;
79 
80  /// returns the number of missed cleavage sites in the peptide
81  size_t missedCleavages() const;
82 
83  /// returns the number of termini that matched the digestion rules
84  size_t specificTermini() const;
85 
86  /// returns true iff the N terminus matched the digestion rules
87  bool NTerminusIsSpecific() const;
88 
89  /// returns true iff the C terminus matched the digestion rules
90  bool CTerminusIsSpecific() const;
91 
92  /// returns residue preceding digestion site
93  std::string NTerminusPrefix() const;
94 
95  /// returns residue following digestion site
96  std::string CTerminusSuffix() const;
97 
98  /// returns true iff peptide sequences, masses, and all digestion metadata are equal
99  bool operator==(const DigestedPeptide& rhs) const;
100 
101  private:
     // stored digestion metadata (presumably backing the accessors of the same
     // name); listing lines 103-105 are absent from this extract
102  size_t offset_;
106  std::string NTerminusPrefix_;
107  std::string CTerminusSuffix_;
108 };
109 
110 
111 /// enumerates the peptides from proteolytic digestion of a polypeptide or protein.
     // NOTE(review): the class-head line (listing line 112) is absent from this
     // extract.
113 {
114  public:
115 
116  /// sets the number of peptide termini that must match a digestion motif
117  /// note: castable to int; i.e. non=0, semi=1, fully=2
118  enum PWIZ_API_DECL Specificity
119  {
120  NonSpecific = 0, ///< neither terminus is required to match the digestion motif(s)
121  SemiSpecific = 1, ///< at least one terminus must match the digestion motif(s)
122  FullySpecific = 2 ///< both termini must match the digestion motif(s)
123  };
124 
125  /// sets constraints for valid peptides produced by iterating the digestion
     // NOTE(review): the struct-head line (listing line 126) and several member
     // declarations (listing lines 128, 133-134, 138) are absent from this
     // extract; the constructor parameters below suggest what they hold.
127  {
129 
130  //double minimumMass;
131  //double maximumMass;
132 
135 
     /// minimum number of termini that must match a digestion motif (see Specificity)
136  Specificity minimumSpecificity;
137 
139 
     /// every argument is optional; the defaults are maximumMissedCleavages = 100000,
     /// minimumLength = 0, maximumLength = 100000,
     /// minimumSpecificity = FullySpecific and clipNTerminalMethionine = true
140  Config(int maximumMissedCleavages = 100000,
141  //double minimumMass = 0,
142  //double maximumMass = 100000,
143  int minimumLength = 0,
144  int maximumLength = 100000,
145  Specificity minimumSpecificity = FullySpecific,
146  bool clipNTerminalMethionine = true);
147  };
148 
149  /// returns the set of predefined cleavage agents defined in the PSI-MS CV
150  static const std::set<CVID>& getCleavageAgents();
151 
152  /// returns the names of the set of predefined cleavage agents defined in the PSI-MS CV
153  static const std::vector<std::string>& getCleavageAgentNames();
154 
155  /// returns the cvid of the specified cleavage agent using a case-insensitive search,
156  /// or CVID_Unknown if the agent is not found
157  static CVID getCleavageAgentByName(const std::string& agentName);
158 
159  /// returns the cvid of the specified cleavage agent looking it up by the Perl regular expression,
160  /// or CVID_Unknown if the agent is not found (the regex pattern must match exactly)
161  static CVID getCleavageAgentByRegex(const std::string& agentRegex);
162 
163  /// returns the official PSI Perl regular expression defining the places in a
164  /// polypeptide or protein that the agent will cut.
165  static const std::string& getCleavageAgentRegex(CVID agentCvid);
166 
167  /// returns a modified version of a cleavage agent regex where any ambiguous AA symbols (BJXZ)
168  /// are augmented with their unambiguous counterparts (e.g. B -> [BND])
169  static std::string disambiguateCleavageAgentRegex(const std::string& cleavageAgentRegex);
170 
171  /// specifies digestion occurs by a commonly used cleavage agent
172  Digestion(const Peptide& polypeptide,
173  CVID cleavageAgent,
174  const Config& config = Config());
175 
176  /// specifies digestion occurs by a combination of commonly used cleavage agents
177  Digestion(const Peptide& polypeptide,
178  const std::vector<CVID>& cleavageAgents,
179  const Config& config = Config());
180 
181  /// specifies digestion occurs by a user-specified, zero-width Perl regular expression
182  /// example: "(?<=K)" means "cleaves after K"
183  /// example: "((?<=D))|((?=D))" means "cleaves before or after D"
184  /// example: "(?=[DE])" means "cleaves before D or E"
185  /// example: "(?<=[FYWLKR])(?!P)" means "cleaves after any single residue from FYWLKR except when it is followed by P"
186  Digestion(const Peptide& polypeptide,
187  const boost::regex& cleavageAgentRegex,
188  const Config& config = Config());
189 
190  /// specifies digestion occurs by a combination of user-specified, zero-width Perl regular expressions
191  /// example: "(?<=K)" means "cleaves after K"
192  /// example: "((?<=D))|((?=D))" means "cleaves before or after D"
193  /// example: "(?=[DE])" means "cleaves before D or E"
194  /// example: "(?<=[FYWLKR])(?!P)" means "cleaves after any single residue from FYWLKR except when it is followed by P"
195  Digestion(const Peptide& polypeptide,
196  const std::vector<boost::regex>& cleavageAgentRegexes,
197  const Config& config = Config());
198 
199  /// returns all instances of the given peptide in the polypeptide under digestion;
200  /// note: the filters set in Digestion::Config are respected!
201  std::vector<DigestedPeptide> find_all(const Peptide& peptide) const;
202 
203  /// returns the first instance of the given peptide in the polypeptide under digestion;
204  /// if offsetHint is provided, the search will begin at that offset;
205  /// throws runtime_error if no instance of the peptide is found;
206  /// note: the filters set in Digestion::Config are respected!
207  DigestedPeptide find_first(const Peptide& peptide, size_t offsetHint = 0) const;
208 
209 
210  ~Digestion();
211 
212 
213  private:
214  class Impl; // forward-declared for const_iterator
215 
216  public:
217 
218  /// provides forward-only, read-only iteration to enumerate peptides
     // NOTE(review): the class-head line (listing line 219) is absent from this
     // extract.
220  {
221  public:
222  const_iterator(const const_iterator& rhs);
223  ~const_iterator();
224 
225  const DigestedPeptide& operator*() const;
226  const DigestedPeptide* operator->() const;
227  const_iterator& operator++();
228  const_iterator operator++(int);
229  bool operator!=(const const_iterator& that) const;
230  bool operator==(const const_iterator& that) const;
231 
     // iterator trait typedefs; listing lines 233 and 236 are absent from this
     // extract (value_type, used by pointer below, is presumably declared there).
     // NOTE(review): difference_type is the unsigned size_t here, whereas the
     // standard iterator requirements call for a signed integer type - confirm
     // before using this iterator with algorithms that compute distances.
232  typedef std::forward_iterator_tag iterator_category;
234  typedef size_t difference_type;
235  typedef value_type* pointer;
237 
238  private:
     // both constructors are private, so only the friends declared below can
     // create an iterator other than by copying one
239  const_iterator();
240  const_iterator(const Digestion& digestion);
241 
242  friend class Digestion;
243  friend class Digestion::Impl;
244 
     // opaque iterator state; Impl is only forward-declared in this header
245  class Impl;
246  boost::shared_ptr<Impl> impl_;
247  };
248 
     /// iterators for enumerating the digested peptides (see const_iterator)
249  const_iterator begin() const;
250  const_iterator end() const;
251 
252  private:
253  friend class const_iterator;
254  friend class const_iterator::Impl;
     // opaque digestion state; Impl is only forward-declared in this header
255  boost::shared_ptr<Impl> impl_;
256 };
257 
258 
259 } // namespace proteome
260 } // namespace pwiz
261 
262 
263 #endif // _DIGESTION_HPP_