libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
enzyme.cpp
Go to the documentation of this file.
1/*******************************************************************************
2 * Copyright (c) 2015 Olivier Langella <Olivier.Langella@moulon.inra.fr>.
3 *
4 * This file is part of the PAPPSOms++ library.
5 *
6 * PAPPSOms++ is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * PAPPSOms++ is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with PAPPSOms++. If not, see <http://www.gnu.org/licenses/>.
18 *
19 * Contributors:
20 * Olivier Langella <Olivier.Langella@moulon.inra.fr> - initial API and
21 *implementation
22 ******************************************************************************/
23
24#include "enzyme.h"
25#include <QStringList>
26#include <QDebug>
28// #include <iostream>
29
30namespace pappso
31{
33{
34 m_recognitionSite.setPattern("([KR])([^P])");
35 m_miscleavage = 0;
36
37
38 char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
39 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
40 m_wildCardX.assign(std::begin(vv1), std::end(vv1));
41
42 char vv2[] = {'N', 'D'};
43 m_wildCardB.assign(std::begin(vv2), std::end(vv2));
44
45 char vv3[] = {'Q', 'E'};
46 m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
47}
48
49Enzyme::Enzyme(const QString &recognition_site)
50{
51 m_recognitionSite.setPattern(recognition_site);
52 m_miscleavage = 0;
53
54
55 char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
56 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
57 m_wildCardX.assign(std::begin(vv1), std::end(vv1));
58
59 char vv2[] = {'N', 'D'};
60 m_wildCardB.assign(std::begin(vv2), std::end(vv2));
61
62 char vv3[] = {'Q', 'E'};
63 m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
64}
65
69
70void
71Enzyme::setMiscleavage(unsigned int miscleavage)
72{
73 m_miscleavage = miscleavage;
74}
75unsigned int
77{
78 return m_miscleavage;
79}
80void
81Enzyme::setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
82{
83 m_maxPeptideVariantListSize = max_peptide_variant_list_size;
84}
85
86void
87Enzyme::eat(std::int8_t sequence_database_id,
88 const ProteinSp &protein_sp,
89 bool is_decoy,
90 EnzymeProductInterface &enzyme_product) const
91{
92 /*
93 * for aa in self.aa_to_cut:
94 seq = seq.replace(aa, aa + ' ')
95 seq_stack = []
96 for s in seq.strip().split(' '):
97 seq_stack.append(s)
98 if len(seq_stack) > self.misscleavage + 1:
99 seq_stack.pop(0)
100 s2 = ""
101 for s_miss in seq_stack[::-1]:
102 s2 = s_miss + s2
103 yield s2
104 */
105 qDebug() << "Enzyme::eat begin ";
106 const QString sequence = protein_sp.get()->getSequence();
107 qDebug() << sequence;
108 QStringList peptide_list;
109 int pos = 0;
110 int peptide_start = 0;
111 int peptide_size = sequence.size();
112 QRegularExpressionMatch match_recognition_site = m_recognitionSite.match(sequence, pos);
113 while(match_recognition_site.hasMatch())
114 {
115 pos = match_recognition_site.capturedStart(0);
116 peptide_size = pos + match_recognition_site.captured(1).length() - peptide_start;
117 // qDebug() << "pos=" << pos << " peptide_start=" << peptide_start << "
118 // peptide_size=" << peptide_size << " " <<
119 // sequence.mid(peptide_start,peptide_size);
120 if(peptide_size > 0)
121 {
122 peptide_list.append(sequence.mid(peptide_start, peptide_size));
123 }
124 peptide_start += peptide_size;
125 pos = peptide_start; // all peptides MUST be consecutive
126 match_recognition_site = m_recognitionSite.match(sequence, pos);
127 }
128 peptide_size = sequence.size() - peptide_start;
129 if(peptide_size > 0)
130 {
131 peptide_list.append(sequence.mid(peptide_start, peptide_size));
132 }
133
134 unsigned int start = 1;
135 bool is_nter = true;
136 foreach(const QString &peptide, peptide_list)
137 {
138 // enzyme_product.setPeptide(sequence_database_id, protein_sp,is_decoy,
139 // peptide, start,is_nter,0, false);
140 sanityCheck(enzyme_product,
141 sequence_database_id,
142 protein_sp,
143 is_decoy,
144 peptide,
145 start,
146 is_nter,
147 0,
148 false);
149 is_nter = false;
150 start += peptide.size();
151 }
152
153 unsigned int miscleavage_i = 0;
154 while(miscleavage_i < m_miscleavage)
155 {
156 miscleavage_i++;
157 qDebug() << "miscleavage_i=" << miscleavage_i;
158 int chunk_number = miscleavage_i + 1;
159 unsigned int start = 1;
160 bool is_nter = true;
161
162 for(auto i = 0; i < peptide_list.size(); ++i)
163 {
164 qDebug() << "start=" << start;
165 QStringList peptide_mis_list;
166 for(auto j = 0; (j < chunk_number) && ((i + j) < peptide_list.size()); j++)
167 {
168 peptide_mis_list << peptide_list.at(i + j);
169 }
170 if(peptide_mis_list.size() == chunk_number)
171 {
172 // enzyme_product.setPeptide(sequence_database_id,
173 // protein_sp,is_decoy, peptide_mis_list.join(""), start,is_nter,
174 // miscleavage_i, false);
175 sanityCheck(enzyme_product,
176 sequence_database_id,
177 protein_sp,
178 is_decoy,
179 peptide_mis_list.join(""),
180 start,
181 is_nter,
182 miscleavage_i,
183 false);
184 }
185 is_nter = false;
186 start += peptide_list.at(i).size();
187 }
188 }
189}
190
191void
192Enzyme::replaceWildcards(std::vector<std::string> *p_peptide_variant_list) const
193{
194 std::string new_peptide = p_peptide_variant_list->at(0);
195 qDebug() << "Enzyme::replaceWildcards begin " << new_peptide.c_str();
196 std::vector<std::string> old_peptide_variant_list;
197 old_peptide_variant_list.assign(p_peptide_variant_list->begin(), p_peptide_variant_list->end());
198
199
200 for(char wildcard : {'X', 'B', 'Z'})
201 {
202
203 std::size_t position = new_peptide.find(wildcard);
204 if(position == std::string::npos)
205 {
206 continue;
207 }
208 else
209 {
210 p_peptide_variant_list->clear();
211 /*
212 new_peptide[position] = 'A';
213 p_peptide_variant_list->push_back(new_peptide);
214 break;
215 */
216
217 const std::vector<char> *p_x_replace_wildcard = nullptr;
218 if(wildcard == 'X')
219 {
220 p_x_replace_wildcard = &m_wildCardX;
221 }
222 else if(wildcard == 'B')
223 {
224 p_x_replace_wildcard = &m_wildCardB;
225 }
226 else if(wildcard == 'Z')
227 {
228 p_x_replace_wildcard = &m_wildCardZ;
229 }
230
231 if(p_x_replace_wildcard != nullptr)
232 {
233 for(std::string orig_peptide : old_peptide_variant_list)
234 {
235 for(char replace : *p_x_replace_wildcard)
236 {
237 orig_peptide[position] = replace;
238 p_peptide_variant_list->push_back(orig_peptide);
239 }
240 }
241 }
242 else
243 {
244 throw ExceptionNotPossible(QObject::tr("x_replace_wildcard is empty"));
245 }
246 // new_peptide[position] = 'A';
247 // p_peptide_variant_list->push_back(new_peptide);
248 // p_peptide_variant_list->resize(1);
249 // std::cerr << "Enzyme::replaceWildcards begin
250 // p_peptide_variant_list.size()=" << p_peptide_variant_list->size()
251 // <<
252 // endl;
253 break;
254 }
255 }
256 std::vector<std::string>().swap(
257 old_peptide_variant_list); // clear old_peptide_variant_list reallocating
258
259
260 qDebug() << "Enzyme::replaceWildcards end " << new_peptide.c_str();
261}
262
263void
264Enzyme::setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
265{
266 m_takeOnlyFirstWildcard = take_only_first_wildcard;
267}
268
269
270void
272 std::int8_t sequence_database_id,
273 const ProteinSp &protein_sp,
274 bool is_decoy,
275 const PeptideStr &peptide,
276 unsigned int start,
277 bool is_nter,
278 unsigned int missed_cleavage_number,
279 bool semi_enzyme) const
280{
281 if(peptide.contains('X') || peptide.contains('B') || peptide.contains('Z'))
282 {
283
284 std::vector<std::string> peptide_variant_list;
285 peptide_variant_list.push_back(peptide.toStdString());
286
287 while((peptide_variant_list.at(0).find('X') != std::string::npos) ||
288 (peptide_variant_list.at(0).find('B') != std::string::npos) ||
289 (peptide_variant_list.at(0).find('Z') != std::string::npos))
290 {
291 replaceWildcards(&peptide_variant_list);
292 if(peptide_variant_list.size() > m_maxPeptideVariantListSize)
293 {
294 peptide_variant_list.resize(m_maxPeptideVariantListSize);
295 peptide_variant_list.shrink_to_fit();
296 }
297 }
298
299 // peptide_variant_list.resize(2);
301 {
302 enzyme_product.setPeptide(sequence_database_id,
303 protein_sp,
304 is_decoy,
305 QString(peptide_variant_list.at(0).c_str()),
306 start,
307 is_nter,
308 missed_cleavage_number,
309 semi_enzyme);
310 }
311 else
312 {
313 std::string peptide_variant = peptide_variant_list.back();
314 while(peptide_variant_list.size() > 0)
315 {
316 enzyme_product.setPeptide(sequence_database_id,
317 protein_sp,
318 is_decoy,
319 QString(peptide_variant.c_str()),
320 start,
321 is_nter,
322 missed_cleavage_number,
323 semi_enzyme);
324 peptide_variant_list.pop_back();
325 if(peptide_variant_list.size() > 0)
326 {
327 peptide_variant = peptide_variant_list.back();
328 }
329 }
330 }
331 std::vector<std::string>().swap(
332 peptide_variant_list); // clear peptide_variant_list reallocating
333 }
334 else
335 {
336 enzyme_product.setPeptide(sequence_database_id,
337 protein_sp,
338 is_decoy,
339 peptide,
340 start,
341 is_nter,
342 missed_cleavage_number,
343 semi_enzyme);
344 }
345}
346
347const QRegularExpression &
352} // namespace pappso
virtual void setPeptide(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme)=0
function to give the products of a protein digestion by an enzyme
QRegularExpression m_recognitionSite
regular expression describing the cleavage site, e.g. "([KR])([^P])" for trypsin
Definition enzyme.h:89
std::size_t m_maxPeptideVariantListSize
Definition enzyme.h:93
unsigned int getMiscleavage() const
get the maximum number of missed cleavage allowed in the digestion
Definition enzyme.cpp:76
Enzyme()
build the default enzyme (trypsin) with recognition_site = "([KR])([^P])"
Definition enzyme.cpp:32
void setMiscleavage(unsigned int miscleavage)
sets the maximum number of missed cleavage allowed in the digestion
Definition enzyme.cpp:71
std::vector< char > m_wildCardB
Definition enzyme.h:97
std::vector< char > m_wildCardZ
Definition enzyme.h:98
std::vector< char > m_wildCardX
Definition enzyme.h:96
void sanityCheck(EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
Definition enzyme.cpp:271
const QRegularExpression & getQRegExpRecognitionSite() const
Definition enzyme.cpp:348
void replaceWildcards(std::vector< std::string > *p_peptide_variant_list) const
Definition enzyme.cpp:192
void setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
sets m_takeOnlyFirstWildcard: when true, only the first peptide variant obtained after wildcard (X, B, Z) replacement is reported
Definition enzyme.cpp:264
void eat(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, EnzymeProductInterface &enzyme_product) const
digest a protein into enzyme products
Definition enzyme.cpp:87
unsigned int m_miscleavage
Definition enzyme.h:90
bool m_takeOnlyFirstWildcard
Definition enzyme.h:91
void setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
if there are wildcards in the protein sequence : restrict the number of possible peptide sequences
Definition enzyme.cpp:81
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
QString PeptideStr
A type definition for PeptideStr.
Definition types.h:55
std::shared_ptr< const Protein > ProteinSp
shared pointer on a Protein object
Definition protein.h:47