SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
io/sam_file/detail/cigar.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <algorithm>
16 #include <concepts>
17 #include <ranges>
18 #include <seqan3/std/charconv>
19 #include <sstream>
20 
29 
30 namespace seqan3::detail
31 {
34 struct view_equality_fn
35 {
37  template <std::ranges::forward_range rng1_type, std::ranges::forward_range rng2_type>
38  constexpr bool operator()(rng1_type && rng1, rng2_type && rng2) const
39  {
40  return std::ranges::equal(rng1, rng2);
41  }
42 };
43 
83 template <typename reference_char_type, typename query_char_type>
84 [[nodiscard]] constexpr cigar::operation map_aligned_values_to_cigar_op(reference_char_type const reference_char,
85  query_char_type const query_char,
86  bool const extended_cigar)
87  requires seqan3::detail::weakly_equality_comparable_with<reference_char_type, gap>
88  && seqan3::detail::weakly_equality_comparable_with<query_char_type, gap>
89 {
90  constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
91  uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
92  if (extended_cigar && (key == 0)) // in extended format refine the substitution operator to match/mismatch.
93  key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
94 
95  return assign_char_to(operators[key], cigar::operation{});
96 }
97 
105 inline void update_alignment_lengths(int32_t & ref_length,
106  int32_t & seq_length,
107  char const cigar_operation,
108  uint32_t const cigar_count)
109 {
110  switch (cigar_operation)
111  {
112  case 'M':
113  case '=':
114  case 'X':
115  ref_length += cigar_count, seq_length += cigar_count;
116  break;
117  case 'D':
118  case 'N':
119  ref_length += cigar_count;
120  break;
121  case 'I':
122  seq_length += cigar_count;
123  break;
124  case 'S':
125  case 'H':
126  case 'P':
127  break; // no op (soft-clipping or padding does not increase either length)
128  default:
129  throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
130  }
131 }
132 
147 template <typename cigar_input_type>
148 inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
149 {
150  std::vector<cigar> operations{};
151  std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
152  char cigar_operation{};
153  uint32_t cigar_count{};
154  int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query
155 
156  // transform input into a single input view if it isn't already
157  auto cigar_view = cigar_input | views::single_pass_input;
158 
159  // parse the rest of the cigar
160  // -------------------------------------------------------------------------------------------------------------
161  while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
162  {
163  auto buff_end = (std::ranges::copy(cigar_view | detail::take_until_or_throw(!is_digit), buffer.data())).out;
164  cigar_operation = *std::ranges::begin(cigar_view);
165  ++std::ranges::begin(cigar_view);
166 
167  if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
168  throw format_error{"Corrupted cigar string encountered"};
169 
170  update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
171  operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
172  }
173 
174  return {operations, ref_length, seq_length};
175 }
176 
214 template <seqan3::detail::pairwise_alignment alignment_type>
215 [[nodiscard]] inline std::vector<cigar> get_cigar_vector(alignment_type && alignment,
216  uint32_t const query_start_pos = 0,
217  uint32_t const query_end_pos = 0,
218  bool const extended_cigar = false)
219 {
220  using std::get;
221 
222  auto & ref_seq = get<0>(alignment);
223  auto & query_seq = get<1>(alignment);
224 
225  if (ref_seq.size() != query_seq.size())
226  throw std::logic_error{"The aligned sequences must have the same length."};
227 
228  std::vector<cigar> result{};
229 
230  if (!ref_seq.size())
231  return result; // return empty string if sequences are empty
232 
233  // Add (S)oft-clipping at the start of the read
234  if (query_start_pos)
235  result.emplace_back(query_start_pos, 'S'_cigar_operation);
236 
237  // Create cigar string from alignment
238  // -------------------------------------------------------------------------
239  // initialize first operation and count value:
240  cigar::operation operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
241  uint32_t count{0};
242 
243  // go through alignment columns
244  for (auto column : views::zip(ref_seq, query_seq))
245  {
246  cigar::operation next_op =
247  map_aligned_values_to_cigar_op(std::get<0>(column), std::get<1>(column), extended_cigar);
248 
249  if (operation == next_op)
250  {
251  ++count;
252  }
253  else
254  {
255  result.emplace_back(count, operation);
256  operation = next_op;
257  count = 1;
258  }
259  }
260 
261  // append last cigar element
262  result.emplace_back(count, operation);
263 
264  // Add (S)oft-clipping at the end of the read
265  if (query_end_pos)
266  result.emplace_back(query_end_pos, 'S'_cigar_operation);
267 
268  return result;
269 }
270 
276 [[nodiscard]] inline std::string get_cigar_string(std::vector<cigar> const & cigar_vector)
277 {
278  std::string result{};
279  std::ranges::for_each(cigar_vector,
280  [&result](auto & cig)
281  {
282  result.append(static_cast<std::string_view>(cig.to_string()));
283  });
284  return result;
285 }
286 
320 template <seqan3::detail::pairwise_alignment alignment_type>
321 [[nodiscard]] inline std::string get_cigar_string(alignment_type && alignment,
322  uint32_t const query_start_pos = 0,
323  uint32_t const query_end_pos = 0,
324  bool const extended_cigar = false)
325 {
326  return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar));
327 }
328 
362 template <seqan3::aligned_sequence ref_seq_type, seqan3::aligned_sequence query_seq_type>
363 [[nodiscard]] inline std::string get_cigar_string(ref_seq_type && ref_seq,
364  query_seq_type && query_seq,
365  uint32_t const query_start_pos = 0,
366  uint32_t const query_end_pos = 0,
367  bool const extended_cigar = false)
368 {
369  return get_cigar_string(std::tie(ref_seq, query_seq), query_start_pos, query_end_pos, extended_cigar);
370 }
371 
394 template <seqan3::detail::writable_pairwise_alignment alignment_type>
395 inline void alignment_from_cigar(alignment_type & alignment, std::vector<cigar> const & cigar_vector)
396 {
397  using std::get;
398  auto current_ref_pos = std::ranges::begin(get<0>(alignment));
399  auto current_read_pos = std::ranges::begin(get<1>(alignment));
400 
401  for (auto [cigar_count, cigar_operation] : cigar_vector)
402  {
403  // ignore since alignment shall contain sliced sequences
404  if (('S'_cigar_operation == cigar_operation) || ('H'_cigar_operation == cigar_operation))
405  continue;
406 
407  assert(('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
408  || ('X'_cigar_operation == cigar_operation) || ('D'_cigar_operation == cigar_operation)
409  || ('N'_cigar_operation == cigar_operation) || ('I'_cigar_operation == cigar_operation)
410  || ('P'_cigar_operation == cigar_operation)); // checked during IO
411 
412  if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
413  || ('X'_cigar_operation == cigar_operation))
414  {
415  std::ranges::advance(current_ref_pos, cigar_count);
416  std::ranges::advance(current_read_pos, cigar_count);
417  }
418  else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
419  {
420  // insert gaps into read
421 
422  assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
423  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
424  ++current_read_pos;
425  std::ranges::advance(current_ref_pos, cigar_count);
426  }
427  else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
428  {
429  assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
430  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
431  ++current_ref_pos;
432  std::ranges::advance(current_read_pos, cigar_count);
433  }
434  else if (('P'_cigar_operation == cigar_operation)) // skip padding
435  {
436  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
437  ++current_ref_pos;
438 
439  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
440  ++current_read_pos;
441  }
442  }
443 }
444 
447 struct access_restrictor_fn
448 {
450  template <typename chr_t>
451  [[noreturn]] chr_t operator()(chr_t) const
452  {
453  throw std::logic_error{"Access is not allowed because there is no sequence information."};
454  }
455 };
456 
457 } // namespace seqan3::detail
Provides the seqan3::cigar alphabet.
The <charconv> header from C++17's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept requires(!std
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: alphabet/cigar/cigar.hpp:98
The <concepts> header from C++20's standard library.
T distance(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: alphabet/concept.hpp:524
requires requires
The rank_type of the semi-alphabet; defined as the return type of seqan3::to_rank....
Definition: alphabet/concept.hpp:164
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
constexpr auto is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:262
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:348
constexpr auto zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition: zip.hpp:573
constexpr auto const & get(configuration< configs_t... > const &config) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: configuration.hpp:415
Provides seqan3::detail::pairwise_alignment and seqan3::detail::writable_pairwise_alignment.
Provides character predicates for tokenisation.
The <ranges> header from C++20's standard library.
Provides seqan3::views::single_pass_input.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::tuple_like.
Provides seqan3::views::zip.