SeqAn3  3.2.0
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <algorithm>
16 #include <filesystem>
17 #include <ranges>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
26 
27 namespace seqan3::detail
28 {
32 struct fm_index_validator
33 {
48  template <semialphabet alphabet_t, text_layout text_layout_mode_, std::ranges::range text_t>
49  static void validate(text_t && text)
50  {
51  if constexpr (text_layout_mode_ == text_layout::single)
52  {
53  static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
54  static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
55  "The alphabet of the text collection must be convertible to the alphabet of the index.");
56  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
57 
58  if (std::ranges::empty(text))
59  throw std::invalid_argument("The text to index cannot be empty.");
60  }
61  else
62  {
63  static_assert(std::ranges::bidirectional_range<text_t>,
64  "The text collection must model bidirectional_range.");
65  static_assert(std::ranges::bidirectional_range<std::ranges::range_reference_t<text_t>>,
66  "The elements of the text collection must model bidirectional_range.");
67  static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
68  "The alphabet of the text collection must be convertible to the alphabet of the index.");
69  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
70 
71  if (std::ranges::empty(text))
72  throw std::invalid_argument("The text collection to index cannot be empty.");
73  }
74  static_assert(alphabet_size<range_innermost_value_t<text_t>> <= 256, "The alphabet is too big.");
75  }
76 };
77 } // namespace seqan3::detail
78 
79 namespace seqan3
80 {
81 
83 // forward declarations
84 template <typename index_t>
85 class fm_index_cursor;
86 
87 template <typename index_t>
88 class bi_fm_index_cursor;
89 
90 namespace detail
91 {
92 template <semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_>
93 class reverse_fm_index;
94 }
96 
130  sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector, // Wavelet tree type
131  sdsl::rank_support_v<>,
132  sdsl::select_support_scan<>,
133  sdsl::select_support_scan<0>>,
134  16, // Sampling rate of the suffix array
135  10'000'000, // Sampling rate of the inverse suffix array
136  sdsl::sa_order_sa_sampling<>, // How to sample positions in the suffix array (text VS SA sampling)
137  sdsl::isa_sampling<>, // How to sample positons in the inverse suffix array
138  sdsl::plain_byte_alphabet>; // How to represent the alphabet
139 
146 
185 template <semialphabet alphabet_t,
186  text_layout text_layout_mode_,
187  detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
188 class fm_index
189 {
190 private:
195  using sdsl_index_type = sdsl_index_type_;
199  using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
201  using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
203 
204  friend class detail::reverse_fm_index<alphabet_t, text_layout_mode_, sdsl_index_type_>;
205 
207  sdsl_index_type index;
208 
210  sdsl::sd_vector<> text_begin;
212  sdsl::select_support_sd<1> text_begin_ss;
214  sdsl::rank_support_sd<1> text_begin_rs;
215 
217  template <typename output_it_t, typename sequence_t>
218  static output_it_t copy_sequence_ranks_shifted_by_one(output_it_t output_it, sequence_t && sequence)
219  {
220  constexpr size_t sigma = alphabet_size<alphabet_t>;
221  constexpr size_t max_sigma = text_layout_mode_ == text_layout::single ? 256u : 255u;
222 
223  constexpr auto warn_if_rank_out_of_range = [](uint8_t const rank)
224  {
225  if (rank >= max_sigma - 1) // same as rank + 1 >= max_sigma but without overflow
226  throw std::out_of_range("The input text cannot be indexed, because for full"
227  "character alphabets the last one/two values are reserved"
228  "(single sequence/collection).");
229  };
230 
232  output_it,
233  [&warn_if_rank_out_of_range](auto const & chr)
234  {
235  uint8_t const rank = seqan3::to_rank(chr);
236  if constexpr (sigma >= max_sigma)
237  warn_if_rank_out_of_range(rank);
238  return rank + 1;
239  })
240  .out;
241  }
242 
262  template <std::ranges::range text_t>
263  requires (text_layout_mode_ == text_layout::single)
264  void construct(text_t && text)
265  {
266  detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
267 
268  // TODO:
269  // * check what happens in sdsl when constructed twice!
270  // * choose between in-memory/external and construction algorithms
271  // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
272  // uint8_t largest_char = 0;
273  sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
274 
275  // copy ranks into tmp_text
276  copy_sequence_ranks_shifted_by_one(std::ranges::begin(tmp_text), text | std::views::reverse);
277 
278  sdsl::construct_im(index, tmp_text, 0);
279 
280  // TODO: would be nice but doesn't work since it's private and the public member references are const
281  // index.m_C.resize(largest_char);
282  // index.m_C.shrink_to_fit();
283  // index.m_sigma = largest_char;
284  }
285 
287  template <std::ranges::range text_t>
288  requires (text_layout_mode_ == text_layout::collection)
289  void construct(text_t && text, bool reverse = false)
290  {
291  detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
292 
293  std::vector<size_t> text_sizes;
294 
295  for (auto && t : text)
296  text_sizes.push_back(std::ranges::distance(t));
297 
298  size_t const number_of_texts{text_sizes.size()};
299 
300  // text size including delimiters
301  size_t const text_size = std::accumulate(text_sizes.begin(), text_sizes.end(), number_of_texts);
302 
303  if (number_of_texts == text_size)
304  throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
305 
306  constexpr auto sigma = alphabet_size<alphabet_t>;
307 
308  // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
309  // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
310  // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
311  sdsl::sd_vector_builder builder(text_size, number_of_texts);
312  size_t prefix_sum{0};
313 
314  for (auto && size : text_sizes)
315  {
316  builder.set(prefix_sum);
317  prefix_sum += size + 1;
318  }
319 
320  text_begin = sdsl::sd_vector<>(builder);
321  text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
322  text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
323 
324  // last text in collection needs no delimiter if we have more than one text in the collection
325  sdsl::int_vector<8> tmp_text(text_size - (number_of_texts > 1));
326 
327  constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
328 
329  auto copy_join_with = [](auto output_it, auto && collection)
330  {
331  // this is basically std::views::join() with a delimiter
332  auto collection_it = std::ranges::begin(collection);
333  auto const collection_sentinel = std::ranges::end(collection);
334  if (collection_it == collection_sentinel)
335  return;
336 
337  output_it = copy_sequence_ranks_shifted_by_one(output_it, *collection_it);
338  ++collection_it;
339 
340  for (; collection_it != collection_sentinel; ++collection_it)
341  {
342  *output_it = delimiter;
343  ++output_it;
344  output_it = copy_sequence_ranks_shifted_by_one(output_it, *collection_it);
345  }
346  };
347 
348  // copy ranks into tmp_text
349  copy_join_with(std::ranges::begin(tmp_text), text);
350 
351  if (!reverse)
352  {
353  // we need at least one delimiter
354  if (number_of_texts == 1)
355  tmp_text.back() = delimiter;
356 
357  std::ranges::reverse(tmp_text);
358  }
359  else
360  {
361  // If only one text is in the text collection, we still need one delimiter at the end to be able to
362  // conduct rank and select queries when locating hits in the index.
363  // Also, tmp_text looks like [text|0], but after reversing we need [txet|0] to be able to add the delimiter.
364  if (number_of_texts == 1)
365  {
366  std::ranges::reverse(tmp_text.begin(), tmp_text.end() - 1);
367  tmp_text.back() = delimiter;
368  }
369  else
370  {
371  std::ranges::reverse(tmp_text);
372  }
373  }
374 
375  sdsl::construct_im(index, tmp_text, 0);
376  }
377 
378 public:
380  static constexpr text_layout text_layout_mode = text_layout_mode_;
381 
386  using alphabet_type = alphabet_t;
388  using size_type = typename sdsl_index_type::size_type;
392 
393  template <typename bi_fm_index_t>
394  friend class bi_fm_index_cursor;
395 
396  template <typename fm_index_t>
397  friend class fm_index_cursor;
398 
399  template <typename fm_index_t>
400  friend class detail::fm_index_cursor_node;
401 
405  fm_index() = default;
406 
408  fm_index(fm_index const & rhs) :
409  index{rhs.index},
410  text_begin{rhs.text_begin},
411  text_begin_ss{rhs.text_begin_ss},
412  text_begin_rs{rhs.text_begin_rs}
413  {
414  text_begin_ss.set_vector(&text_begin);
415  text_begin_rs.set_vector(&text_begin);
416  }
417 
419  fm_index(fm_index && rhs) :
420  index{std::move(rhs.index)},
421  text_begin{std::move(rhs.text_begin)},
422  text_begin_ss{std::move(rhs.text_begin_ss)},
423  text_begin_rs{std::move(rhs.text_begin_rs)}
424  {
425  text_begin_ss.set_vector(&text_begin);
426  text_begin_rs.set_vector(&text_begin);
427  }
428 
430  fm_index & operator=(fm_index rhs)
431  {
432  index = std::move(rhs.index);
433  text_begin = std::move(rhs.text_begin);
434  text_begin_ss = std::move(rhs.text_begin_ss);
435  text_begin_rs = std::move(rhs.text_begin_rs);
436 
437  text_begin_ss.set_vector(&text_begin);
438  text_begin_rs.set_vector(&text_begin);
439 
440  return *this;
441  }
442 
443  ~fm_index() = default;
444 
453  template <std::ranges::bidirectional_range text_t>
454  explicit fm_index(text_t && text)
455  {
456  construct(std::forward<text_t>(text));
457  }
459 
471  size_type size() const noexcept
472  {
473  return index.size();
474  }
475 
487  bool empty() const noexcept
488  {
489  return size() == 0;
490  }
491 
503  bool operator==(fm_index const & rhs) const noexcept
504  {
505  // (void) rhs;
506  return (index == rhs.index) && (text_begin == rhs.text_begin);
507  }
508 
520  bool operator!=(fm_index const & rhs) const noexcept
521  {
522  return !(*this == rhs);
523  }
524 
539  cursor_type cursor() const noexcept
540  {
541  return {*this};
542  }
543 
551  template <cereal_archive archive_t>
552  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
553  {
554  archive(index);
555  archive(text_begin);
556  archive(text_begin_ss);
557  text_begin_ss.set_vector(&text_begin);
558  archive(text_begin_rs);
559  text_begin_rs.set_vector(&text_begin);
560 
561  auto sigma = alphabet_size<alphabet_t>;
562  archive(sigma);
563  if (sigma != alphabet_size<alphabet_t>)
564  {
565  throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma)
566  + " but it is being read into an fm_index with an alphabet of size "
567  + std::to_string(alphabet_size<alphabet_t>) + "."};
568  }
569 
570  bool tmp = text_layout_mode;
571  archive(tmp);
572  if (tmp != text_layout_mode)
573  {
574  throw std::logic_error{std::string{"The fm_index was built over a "}
575  + (tmp ? "text collection" : "single text")
576  + " but it is being read into an fm_index expecting a "
577  + (text_layout_mode ? "text collection." : "single text.")};
578  }
579  }
581 };
582 
587 template <std::ranges::range text_t>
588 fm_index(text_t &&) -> fm_index<range_innermost_value_t<text_t>, text_layout{range_dimension_v<text_t> != 1}>;
590 } // namespace seqan3
591 
592 namespace seqan3::detail
593 {
594 
608 template <semialphabet alphabet_t,
609  text_layout text_layout_mode,
610  detail::sdsl_index sdsl_index_type = default_sdsl_index_type>
611 class reverse_fm_index : public fm_index<alphabet_t, text_layout_mode, sdsl_index_type>
612 {
613 private:
615  template <std::ranges::range text_t>
616  void construct_(text_t && text)
617  {
618  if constexpr (text_layout_mode == text_layout::single)
619  {
620  auto reverse_text = text | std::views::reverse;
621  this->construct(reverse_text);
622  }
623  else
624  {
625  auto reverse_text = text | views::deep{std::views::reverse} | std::views::reverse;
626  this->construct(reverse_text, true);
627  }
628  }
629 
630 public:
631  using fm_index<alphabet_t, text_layout_mode, sdsl_index_type>::fm_index;
632 
634  template <std::ranges::bidirectional_range text_t>
635  explicit reverse_fm_index(text_t && text)
636  {
637  construct_(std::forward<text_t>(text));
638  }
639 };
640 
641 } // namespace seqan3::detail
T accumulate(T... args)
T begin(T... args)
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:55
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:87
The SeqAn FM Index.
Definition: fm_index.hpp:189
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T end(T... args)
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: search/fm_index/concept.hpp:91
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:145
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10 '000 '000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:138
@ single
The text is a single range.
Definition: search/fm_index/concept.hpp:93
@ collection
The text is a range of ranges.
Definition: search/fm_index/concept.hpp:95
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:469
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
The basis for seqan3::alphabet, but requires only rank interface (not char).
The generic concept for a (biological) sequence.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
fm_index(text_t &&) -> fm_index< range_innermost_value_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition: fm_index.hpp:588
T push_back(T... args)
The <ranges> header from C++20's standard library.
Provides the concept for seqan3::detail::sdsl_index.
T size(T... args)
Provides seqan3::views::to_rank.
T to_string(T... args)