libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
psmspecpeptidoms.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/cbor/psm/evalscan/psmspecglob.cpp
3 * \date 19/07/2025
4 * \author Olivier Langella
5 * \brief compute specglob alignment on scan's PSM
6 */
7
8/*******************************************************************************
9 * Copyright (c) 2025 Olivier Langella <Olivier.Langella@universite-paris-saclay.fr>.
10 *
11 * This file is part of PAPPSOms-tools.
12 *
13 * PAPPSOms-tools is free software: you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation, either version 3 of the License, or
16 * (at your option) any later version.
17 *
18 * PAPPSOms-tools is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with PAPPSOms-tools. If not, see <http://www.gnu.org/licenses/>.
25 *
26 ******************************************************************************/
27
28
29#include "psmspecpeptidoms.h"
30
39
41 pappso::cbor::CborStreamWriter *cbor_output_p,
42 const QJsonObject &parameters)
43 : PsmFileScanProcessAndCopy(buffer_scan_size, cbor_output_p, "SpecPeptidOms")
44{
45 m_specpeptidomsParameters = parameters;
46
47 if(parameters.value("fragment_tolerance_unit").toString() == "dalton")
48 {
49 m_fragmentTolerance = pappso::PrecisionFactory::getDaltonInstance(
50 parameters.value("fragment_tolerance").toDouble());
51 }
52 else if(parameters.value("fragment_tolerance_unit").toString() == "ppm")
53 {
54 m_fragmentTolerance =
55 pappso::PrecisionFactory::getPpmInstance(parameters.value("fragment_tolerance").toDouble());
56 }
57
58 QJsonObject spectrum_param = parameters.value("spectrum").toObject();
59
60 m_minimumMz = spectrum_param.value("minimum_mz").toDouble();
61 m_nMostIntense = spectrum_param.value("n_most_intense").toInt();
62 m_deisotope = spectrum_param.value("deisotope").toBool();
63
64 m_aaCode.addAaModification('C', AaModification::getInstance("MOD:00397"));
65
66
67 m_maxInterpretationsPerSpectrum = parameters.value("max_interpretations_per_spectrum").toInt();
68
70}
71
75
76void
86
87void
95
96void
98{
99 if(!m_decoyPrefix.isEmpty())
100 {
101 // generate decoy sequences
102
103 PsmProtein new_decoy_protein;
104 for(auto &it_protein : m_proteinMap.getProteinMap())
105 {
106 new_decoy_protein = it_protein.second;
107 new_decoy_protein.protein_sp =
108 std::make_shared<Protein>(*new_decoy_protein.protein_sp.get());
109 new_decoy_protein.protein_sp.get()->reverse();
110 new_decoy_protein.protein_sp.get()->setAccession(
111 m_decoyPrefix + new_decoy_protein.protein_sp.get()->getAccession());
112 new_decoy_protein.isTarget = false;
113 }
114 }
116}
117
118
119void
121 [[maybe_unused]])
122{
123
124 QCborMap cbor_peptidoms_parameters = QCborValue::fromJsonValue(m_specpeptidomsParameters).toMap();
125
126 m_cborParameterMap.insert(QString("peptidoms"), cbor_peptidoms_parameters);
127
128 mp_cborOutput->append("parameter_map");
129 mp_cborOutput->writeCborMap(m_cborParameterMap);
130}
131
137
138const pappso::AaCode &
143
144
146 const pappso::cbor::psm::PsmSpecPeptidOms &psm_specpeptidoms,
147 pappso::PrecisionPtr fragment_tolerance [[maybe_unused]])
148 : CborScanMapBase(psm_specpeptidoms)
149{
150 mp_psmSpecPeptidOms = &psm_specpeptidoms;
151
152 m_decoyPrefix = mp_psmSpecPeptidOms->m_decoyPrefix;
153}
154
158
159void
161{
162 // qWarning() << "PsmSpecPeptidOmsScan::process " << keys();
163 if(!keys().contains("id"))
164 {
165 throw pappso::PappsoException(QObject::tr("missing scan id"));
166 }
167 if(keys().contains("psm_list"))
168 {
170
171 // qWarning() << "PsmSpecPeptidOmsScan::process "
172 // << qualified_mass_spectrum.get()->getMassSpectrumId().getSpectrumIndex();
173
174 mp_psmSpecPeptidOms->filterMassSpectrum(
175 *(qualified_mass_spectrum.get()->getMassSpectrumSPtr().get()));
176
177 pappso::specpeptidoms::SpOMSSpectrumCsp experimental_spectrum =
178 std::make_shared<pappso::specpeptidoms::SpOMSSpectrum>(
179 *qualified_mass_spectrum.get(),
180 mp_psmSpecPeptidOms->m_fragmentTolerance,
181 mp_psmSpecPeptidOms->getAaCode());
182
183
184 pappso::specpeptidoms::SemiGlobalAlignment semi_global_alignment(
185 mp_psmSpecPeptidOms->m_scoreValues,
186 mp_psmSpecPeptidOms->m_fragmentTolerance,
187 mp_psmSpecPeptidOms->m_aaCode);
188
189
190 QCborArray new_psm_arr;
191 for(QCborValue cbor_psm : value("psm_list").toArray())
192 {
193 QCborMap old_cbor_psm_map = cbor_psm.toMap();
194
195
196 if(!old_cbor_psm_map.keys().contains("proforma"))
197 {
199 QObject::tr("missing proforma in psm %1").arg(old_cbor_psm_map.keys().size()));
200 }
202 old_cbor_psm_map.value("proforma").toString());
203
204
205 pappso::specpeptidoms::SpOMSProtein protein(old_cbor_psm_map.value("proforma").toString(),
206 peptide_sp.get()->getSequence(),
207 mp_psmSpecPeptidOms->m_aaCode);
208
209 sequenceAlignment(false,
210 old_cbor_psm_map,
211 new_psm_arr,
212 experimental_spectrum,
213 semi_global_alignment,
214 &protein);
215
216 if(!m_decoyPrefix.isEmpty())
217 {
218 QString sequence = peptide_sp.get()->getSequence();
219 std::reverse(sequence.begin(), sequence.end());
220
222 m_decoyPrefix + old_cbor_psm_map.value("proforma").toString(),
223 sequence,
224 mp_psmSpecPeptidOms->m_aaCode);
225
227 old_cbor_psm_map,
228 new_psm_arr,
229 experimental_spectrum,
230 semi_global_alignment,
231 &protein_rev);
232 }
233
234
235 // qWarning() << "new_psm_arr.size()=" << new_psm_arr.size();
236 }
237
238 // insert(QString("psm_list"), new_psm_arr);
239 remove(QString("psm_list"));
240 insert(QString("psm_list"), new_psm_arr);
241
243 }
244 // qWarning() << "PsmSpecPeptidOmsScan::process end";
245}
246
247
248void
250 bool is_reverse,
251 const QCborMap &old_cbor_psm_map,
252 QCborArray &new_psm_arr,
253 pappso::specpeptidoms::SpOMSSpectrumCsp &experimental_spectrum,
254 pappso::specpeptidoms::SemiGlobalAlignment &semi_global_alignment,
255 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
256{
257
258 std::vector<pappso::specpeptidoms::Location> locations;
259 std::vector<double> potential_mass_errors;
260 const QString &sequence = protein_ptr->getSequence();
261 // qWarning() << "PsmSpecPeptidOmsScan::process " << peptide_sequence;
262 // do not take into account peptide containing too much redundancy
264 {
265 if((sequence.size() >= 8) &&
267 return;
268
269 semi_global_alignment.fastAlign(*experimental_spectrum.get(), protein_ptr);
270 // qDebug() << "Completed fastAlign";
271 locations = semi_global_alignment.getLocationSaver().getLocations();
272
273 qDebug() << "locations.size():" << locations.size();
274 for(auto loc : locations)
275 {
276 QCborMap new_cbor_psm;
277 qDebug() << "beginning=" << loc.beginning << "length=" << loc.length
278 << "tree=" << loc.tree << "score=" << loc.score
279 << "protein=" << loc.proteinPtr->getAccession();
280 semi_global_alignment.preciseAlign(
281 *experimental_spectrum.get(), loc.proteinPtr, loc.beginning, loc.length);
282 qDebug() << "Completed preciseAlign";
283
284 pappso::specpeptidoms::Alignment best_alignment =
285 semi_global_alignment.getBestAlignment();
286 /* qDebug() << "Best alignment" << best_alignment.interpretation
287 << best_alignment.score << "SPC" << best_alignment.SPC
288 << "beginning" << best_alignment.beginning << "end"
289 << best_alignment.end;*/
290 if(best_alignment.end > (std::size_t)sequence.size())
291 {
292 throw pappso::ExceptionOutOfRange(QString("best_alignment.end > "
293 "(std::size_t)sequence.size() : %1 %2")
294 .arg(best_alignment.end)
295 .arg(sequence.size()));
296 }
297 if(best_alignment.begin_shift > 0 || best_alignment.end_shift > 0 ||
298 best_alignment.shifts.size() > 0)
299 {
300 qDebug();
301 potential_mass_errors =
303 mp_psmSpecPeptidOms->m_aaCode, best_alignment, sequence);
304 qDebug();
305 semi_global_alignment.postProcessingAlign(*experimental_spectrum.get(),
306 loc.proteinPtr,
307 loc.beginning,
308 loc.length,
309 potential_mass_errors);
310
311 qDebug() << "semi_global_alignment.getBestAlignment()";
312 pappso::specpeptidoms::Alignment best_post_processed_alignment =
313 semi_global_alignment.getBestAlignment();
314 if(best_post_processed_alignment.SPC > best_alignment.SPC)
315 {
316 qDebug() << "Best post-processed alignment"
317 << best_post_processed_alignment.m_peptideModel.toInterpretation()
318 << best_post_processed_alignment.score << "SPC"
319 << best_post_processed_alignment.SPC;
320 storeAlignment(is_reverse,
321 old_cbor_psm_map,
322 new_cbor_psm,
323 protein_ptr->getAccession(),
324 best_post_processed_alignment);
325 }
326 else
327 {
328 qDebug() << "no improvement in post-processing";
329 storeAlignment(is_reverse,
330 old_cbor_psm_map,
331 new_cbor_psm,
332 protein_ptr->getAccession(),
333 best_alignment);
334 }
335 }
336 else
337 {
338
339 storeAlignment(is_reverse,
340 old_cbor_psm_map,
341 new_cbor_psm,
342 protein_ptr->getAccession(),
343 best_alignment);
344 }
345
346 if(!new_cbor_psm.isEmpty())
347 {
348 new_psm_arr.push_back(new_cbor_psm);
349 }
350 }
351 }
352}
353
354void
356{
357 std::size_t max_psm = mp_psmSpecPeptidOms->m_maxInterpretationsPerSpectrum;
358
359
360 struct sortPsmResults
361 {
362 qint64 score;
363 QCborMap psm;
364 };
365
366 QCborArray old_psm_arr = value("psm_list").toArray();
367 QCborArray new_psm_arr;
368 // for(QCborValue cbor_psm : old_psm_arr) {
369
370
371 std::vector<sortPsmResults> sort_psm_list;
372 for(auto it_psm : old_psm_arr)
373 {
374 QCborMap psm_map = it_psm.toMap();
375 qint64 score =
376 psm_map.value("eval").toMap().value("peptidoms").toMap().value("score").toInteger();
377 sort_psm_list.push_back({score, psm_map});
378 }
379 qDebug();
380 std::sort(sort_psm_list.begin(), sort_psm_list.end(), [](sortPsmResults &a, sortPsmResults &b) {
381 return a.score > b.score;
382 });
383
384
385 auto it_end = sort_psm_list.begin() + max_psm;
386
387
388 qDebug() << sort_psm_list.size();
389 for(auto it = sort_psm_list.begin(); it != sort_psm_list.end() && it != it_end; it++)
390 {
391
392 new_psm_arr.append(it->psm);
393 qDebug();
394 }
395
396 // qWarning() << new_psm_arr.size();
397 remove(QString("psm_list"));
398 insert(QString("psm_list"), new_psm_arr);
399}
400
401
402void
404 bool is_reverse,
405 const QCborMap &old_cbor_psm,
406 QCborMap &new_cbor_psm,
407 const QString &accession,
408 const pappso::specpeptidoms::Alignment &alignment)
409{
410 qDebug() << accession;
411
412 if(alignment.score > 0)
413 {
414
415 QString peptide_key(alignment.m_peptideModel.toProForma());
416
417
418 /*
419
420 {"proforma":"LHGAGADDADADD",
421 "protein_list":
422 [{
423 "accession": "GRMZM2G108267_P01",
424 "positions": [
425 337
426 ]
427 }
428 {
429 "accession": "GRMZM2G108267_P02",
430 "positions": [
431 337
432 ]
433 }
434 ]
435 ,
436 "eval":{
437 "matcher": {
438 "score": 193077
439 }
440 }
441 }
442 */
443
444 // do not take into account peptide containing too much redundancy
446 {
447 // qDebug() << "peptide_key=" << peptide_key;
448 new_cbor_psm.insert(QString("proforma"), peptide_key);
449
450 // copy protein list in new psm
451 new_cbor_psm.insert(QString("protein_list"), old_cbor_psm.value("protein_list"));
452
453 fixPositionStart(is_reverse, new_cbor_psm, alignment.getPositionStart());
454
455 QCborMap cbor_eval;
456
457 QCborMap cbor_peptidoms;
458
459 // auto it_protein_pos =
460 // one_peptide_result.map_protein2positions.insert({accession, QCborArray()});
461 // it_protein_pos.first->second.append((qint64)alignment.getPositionStart());
462
463 cbor_peptidoms.insert(QString("bracket"), alignment.m_peptideModel.toInterpretation());
464 cbor_peptidoms.insert(QString("spc"), (qint64)alignment.SPC);
465 cbor_peptidoms.insert(QString("score"), alignment.score);
466 cbor_peptidoms.insert(QString("nam"), alignment.getNonAlignedMass());
467
468
469 // copy matcher eval in new psm
470 cbor_eval.insert(QString("matcher"), old_cbor_psm.value("eval").toMap().value("matcher"));
471
472
473 // store peptidoms eval in new psm
474 cbor_eval.insert(QString("peptidoms"), cbor_peptidoms);
475
476 new_cbor_psm.insert(QString("eval"), cbor_eval);
477 }
478 }
479}
480void
482 QCborMap &new_cbor_psm,
483 std::size_t offset_position) const
484{
485 QCborArray new_protein_list;
486 for(auto qcbor_protein : new_cbor_psm.value("protein_list").toArray())
487 {
488 QCborMap protein;
489
490 QCborArray positions;
491 if(is_reverse)
492 {
493 protein.insert(QString("accession"),
494 m_decoyPrefix + qcbor_protein.toMap().value("accession").toString());
495 for(auto position : qcbor_protein.toMap().value("positions").toArray())
496 {
497 positions.append(position.toInteger() + (qint64)offset_position);
498 }
499 }
500 else
501 {
502
503 protein.insert(QString("accession"), qcbor_protein.toMap().value("accession"));
504 for(auto position : qcbor_protein.toMap().value("positions").toArray())
505 {
506 positions.append(position.toInteger() + (qint64)offset_position);
507 }
508 }
509
510 protein.insert(QString("positions"), positions.toCborValue());
511
512 new_protein_list.append(protein);
513 }
514 new_cbor_psm.insert(QString("protein_list"), new_protein_list);
515}
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
static AaModificationP getInstance(const QString &accession)
Trace & filter(Trace &data_points) const override
get all the datapoints and remove different isotope and add their intensity and change to charge = 1 ...
keep N datapoints form the greatest intensities to the lowest
Definition filterpass.h:96
Trace & filter(Trace &data_points) const override
Trace & filter(Trace &trace) const override
Class to represent a mass spectrum.
static PeptideSp parseString(const QString &pepstr)
virtual void setStatus(const QString &status)=0
current status of the process
overrides QCborStreamWriter base class to provide convenient functions
CborScanMapBase(const PsmFileScanProcess &psm_file_scan_process)
pappso::QualifiedMassSpectrumSPtr getCurrentQualifiedMassSpectrumSPtr() const
PsmFileScanProcessAndCopy(std::size_t buffer_scan_size, CborStreamWriter *cbor_output_p, const QString &operation)
virtual void proteinMapReady(pappso::UiMonitorInterface &monitor) override
virtual void processBufferScanDone(pappso::UiMonitorInterface &monitor) override
void sequenceAlignment(bool is_reverse, const QCborMap &old_cbor_psm_map, QCborArray &new_psm_arr, pappso::specpeptidoms::SpOMSSpectrumCsp &experimental_spectrum, pappso::specpeptidoms::SemiGlobalAlignment &semi_global_alignment, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
void storeAlignment(bool is_reverse, const QCborMap &old_cbor_psm, QCborMap &new_cbor_psm, const QString &accession, const pappso::specpeptidoms::Alignment &alignment)
void fixPositionStart(bool is_reverse, QCborMap &new_cbor_psm, std::size_t offset_position) const
const PsmSpecPeptidOms * mp_psmSpecPeptidOms
PsmSpecPeptidOmsScan(const PsmSpecPeptidOms &psm_specpeptidoms, pappso::PrecisionPtr fragment_tolerance)
void parameterMapReady(pappso::UiMonitorInterface &monitor) override
virtual void proteinMapReady(pappso::UiMonitorInterface &monitor) override
PsmSpecPeptidOms(std::size_t buffer_scan_size, CborStreamWriter *cbor_output_p, const QJsonObject &parameters)
const pappso::AaCode & getAaCode() const
void filterMassSpectrum(pappso::MassSpectrum &mass_spectrum) const
CborScanMapBase * newCborScanMap() override
virtual void processBufferScanDone(pappso::UiMonitorInterface &monitor) override
std::vector< Location > getLocations() const
Returns a vector containing the saved locations.
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
const QString & getSequence() const
std::shared_ptr< const SpOMSSpectrum > SpOMSSpectrumCsp
std::shared_ptr< QualifiedMassSpectrum > QualifiedMassSpectrumSPtr
std::shared_ptr< const Peptide > PeptideSp
const PrecisionBase * PrecisionPtr
Definition precision.h:122
std::shared_ptr< Protein > protein_sp
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::size_t getPositionStart() const
get position of start on the protein sequence