SeqAn3  3.2.0
The Modern C++ library for sequence analysis.
sequence_file/input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <filesystem>
17 #include <fstream>
18 #include <string>
19 #include <variant>
20 #include <vector>
21 
30 #include <seqan3/io/detail/record.hpp>
31 #include <seqan3/io/exception.hpp>
41 
42 namespace seqan3
43 {
44 
45 // ----------------------------------------------------------------------------
46 // sequence_file_input_traits
47 // ----------------------------------------------------------------------------
48 
97 template <typename t>
99  requires (t v) {
104 
107 
110  };
112 
113 // ----------------------------------------------------------------------------
114 // sequence_file_input_default_traits
115 // ----------------------------------------------------------------------------
116 
133 {
141 
144 
146  template <typename _sequence_alphabet>
148 
150  using id_alphabet = char;
151 
153  template <typename _id_alphabet>
155 
158 
160  template <typename _quality_alphabet>
162 
164 };
165 
169 {
177 
181 };
182 
183 // ----------------------------------------------------------------------------
184 // sequence_file_input
185 // ----------------------------------------------------------------------------
186 
205  detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::qual>,
206  detail::type_list_of_sequence_file_input_formats valid_formats_ =
209 {
210 public:
// Member aliases that re-export the class template arguments (plus the stream character type).
216  using traits_type = traits_type_; // traits type that defines aliases and templates for storage of the fields
218  using selected_field_ids = selected_field_ids_; // seqan3::fields list with the fields selected for the record
220  using valid_formats = valid_formats_; // seqan3::type_list with the possible formats
222  using stream_char_type = char; // character type of the stream(s)
224 
229 
// Compile-time validation of the selected fields: every entry of selected_field_ids::as_array
// must be contained in field_ids, otherwise compilation stops with the message below.
230  static_assert(
231  []() constexpr {
     // immediately-invoked constexpr lambda; yields false on the first field that field_ids does not contain
232  for (field f : selected_field_ids::as_array)
233  if (!field_ids::contains(f))
234  return false;
235  return true;
236  }(),
237  "You selected a field that is not valid for sequence files, please refer to the documentation "
238  "of sequence_file_input::field_ids for the accepted values.");
239 
// Field types: each one applies the traits' container template to the matching traits alphabet.
248  using id_type = typename traits_type::template id_container<typename traits_type::id_alphabet>; // type of field::id
250  using quality_type = typename traits_type::template quality_container<typename traits_type::quality_alphabet>; // type of field::qual
253 
258 
268  using const_reference = void; // void, because files are not const-iterable
270  using size_type = size_t; // an unsigned integer type
274  using iterator = detail::in_file_iterator<sequence_file_input>; // iterator over this file (an input iterator)
276  using const_iterator = void; // void, because files are not const-iterable
278  using sentinel = std::default_sentinel_t; // the type returned by end()
280 
295  ~sequence_file_input() = default; // defaulted: no hand-written clean-up code in the destructor itself
296 
314  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
315  primary_stream{new std::ifstream{}, stream_deleter_default}
316  {
     // Construct from filename: the file object allocates and owns its std::ifstream
     // (stream_deleter_default), opens `filename`, wraps the stream via
     // detail::make_secondary_istream and instantiates the format handler.
     //
     // Attach the preallocated stream_buffer to the file buffer *before* open() is called.
317  primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
     // primary_stream is stored as a pointer to the istream base; cast back to the ifstream to open it (binary, read-only).
318  static_cast<std::basic_ifstream<char> *>(primary_stream.get())
319  ->open(filename, std::ios_base::in | std::ios::binary);
320 
     // Throws file_open_error when the stream is not good() after the open attempt.
321  if (!primary_stream->good())
322  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
323 
324  // possibly add intermediate compression stream
325  secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
326 
327  // initialise format handler or throw if format is not found
     // A variant over the exposers of all valid_formats is set from the filename by detail::set_format
     // (presumably by file extension -- confirm in detail::set_format).
328  using format_variant_t =
329  typename detail::variant_from_tags<valid_formats, detail::sequence_file_input_format_exposer>::type;
330  format_variant_t format_variant{};
331  detail::set_format(format_variant, filename);
332 
     // Create the selected_sequence_format handler that matches the alternative held by the variant.
333  std::visit(
334  [&](auto && selected_format)
335  {
336  using format_t = std::remove_cvref_t<decltype(selected_format)>;
337  format = std::make_unique<selected_sequence_format<format_t>>();
338  },
339  format_variant);
340  }
341  /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
342  * A combination of default template parameters and auto-deduction guides works as expected,
343  * independent of whether the second/optional parameter is specified or not, i.e. it is possible
344  * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
345  * is specified and use the default otherwise.
346  */
347 
// Construct from an existing stream and with specified format.
// The stream is referenced, not owned: primary_stream stores its address together with
// stream_deleter_noop, so this object never deletes it.
// format_tag and fields_tag carry no runtime data here; only their types are used.
362  template <input_stream stream_t, sequence_file_input_format file_format>
363  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
364  sequence_file_input(stream_t & stream,
365  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
366  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
367  primary_stream{&stream, stream_deleter_noop},
368  format{std::make_unique<selected_sequence_format<file_format>>()}
369  {
     // Reject, at compile time, a format that is not part of this file's valid_formats.
370  static_assert(list_traits::contains<file_format, valid_formats>,
371  "You selected a format that is not in the valid_formats of this file.");
372 
373  // possibly add intermediate compression stream
     // (no filename is available here, so only the stream itself is passed)
374  secondary_stream = detail::make_secondary_istream(*primary_stream);
375  }
376 
// Overload for a temporary stream: the stream is moved into a heap-allocated stream_t that
// this object owns (stream_deleter_default deletes it). Otherwise identical to the overload
// taking the stream by lvalue reference.
378  template <input_stream stream_t, sequence_file_input_format file_format>
379  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
380  sequence_file_input(stream_t && stream,
381  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
382  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
383  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
384  format{std::make_unique<selected_sequence_format<file_format>>()}
385  {
     // Reject, at compile time, a format that is not part of this file's valid_formats.
386  static_assert(list_traits::contains<file_format, valid_formats>,
387  "You selected a format that is not in the valid_formats of this file.");
388 
389  // possibly add intermediate compression stream
390  secondary_stream = detail::make_secondary_istream(*primary_stream);
391  }
393 
413  {
     // Returns an iterator to the current position in the file.
     // The first call reads one record into record_buffer; later calls do not read again.
414  // buffer first record
415  if (!first_record_was_read)
416  {
417  read_next_record();
418  first_record_was_read = true;
419  }
420 
     // The iterator is constructed from a reference to this file object.
421  return {*this};
422  }
423 
// Returns a sentinel for comparison with an iterator.
// The sentinel is a value-initialised std::default_sentinel_t and carries no state.
437  sentinel end() noexcept
438  {
439  return {};
440  }
441 
// Return the record we are currently at in the file (dereferences begin()).
// NOTE(review): this is declared noexcept, but begin() may call read_next_record() on first
// use; if reading can throw, the exception would terminate the program here -- confirm intended.
465  reference front() noexcept
466  {
467  return *begin();
468  }
470 
475 
476 protected:
478 
482  record_type record_buffer; // the record last read; cleared and refilled by read_next_record()
484  std::vector<char> stream_buffer{std::vector<char>(1'000'000)}; // 1'000'000 chars, handed to the file stream via pubsetbuf() in the filename constructor
486  std::streampos position_buffer{}; // stream position slot passed to the format's read_sequence_record() on every read
488 
// Deleter that does nothing; used for streams this object does not own
// (a stream passed by lvalue reference, and the initial null stream pointers).
496  static void stream_deleter_noop(std::basic_istream<stream_char_type> *)
497  {}
// Deleter that deletes the stream; used for streams this object allocated itself with `new`
// (the std::ifstream opened from a filename, or a stream moved in from a temporary).
499  static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr)
500  {
501  delete ptr;
502  }
503 
505  stream_ptr_t primary_stream{nullptr, stream_deleter_noop}; // the underlying stream (opened file, moved-in stream or user-provided stream)
507  stream_ptr_t secondary_stream{nullptr, stream_deleter_noop}; // result of detail::make_secondary_istream(); records are read from this stream
508 
510  bool first_record_was_read{false}; // set by begin() once the first record has been buffered
512  bool at_end{false}; // set by read_next_record() when no further record could be read
514 
515 private:
// Reads the next record from secondary_stream into record_buffer.
// If nothing more can be read, at_end is set and record_buffer is left cleared.
517  void read_next_record()
518  {
519  // clear the record
520  record_buffer.clear();
521 
522  // at end if we could not read further
     // NOTE(review): the rest of this condition (source line 524) is elided in this listing;
     // presumably the stream-buffer iterator is compared against an end-of-stream iterator -- confirm.
523  if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream}
525  {
526  at_end = true;
527  return;
528  }
529 
     // Delegate the actual parsing to the format handler chosen at construction.
530  format->read_sequence_record(*secondary_stream, record_buffer, position_buffer, options);
531  }
532 
// Polymorphic base of the format handlers. The file object calls read_sequence_record()
// through a pointer to this type, so it need not know the concrete format type.
543  struct sequence_format_base
544  {
     // All special member functions are defaulted; the destructor is virtual so that a handler
     // can be destroyed through a pointer to this base.
548  sequence_format_base() = default;
549  sequence_format_base(sequence_format_base const &) = default;
550  sequence_format_base(sequence_format_base &&) = default;
551  sequence_format_base & operator=(sequence_format_base const &) = default;
552  sequence_format_base & operator=(sequence_format_base &&) = default;
553  virtual ~sequence_format_base() = default;
555 
     // Reads one record from instream into record_buffer.
     // NOTE(review): the last parameter line (source line 570) is elided in this listing; the
     // override in selected_sequence_format takes `sequence_file_input_options_type const & options` there.
567  virtual void read_sequence_record(std::istream & instream,
568  record_type & record_buffer,
569  std::streampos & position_buffer,
571  };
572 
// Concrete format handler for one format type format_t.
// Holds a detail::sequence_file_input_format_exposer<format_t> and forwards each read to it.
584  template <typename format_t>
585  struct selected_sequence_format final : public sequence_format_base
586  {
     // All special member functions are defaulted.
590  selected_sequence_format() = default;
591  selected_sequence_format(selected_sequence_format const &) = default;
592  selected_sequence_format(selected_sequence_format &&) = default;
593  selected_sequence_format & operator=(selected_sequence_format const &) = default;
594  selected_sequence_format & operator=(selected_sequence_format &&) = default;
595  ~selected_sequence_format() = default;
597 
     // Reads one record from instream into record_buffer using the wrapped format.
     // Note the argument order of the wrapped call differs from this signature:
     // (stream, options, position, seq, id, qual).
599  void read_sequence_record(std::istream & instream,
600  record_type & record_buffer,
601  std::streampos & position_buffer,
602  sequence_file_input_options_type const & options) override
603  {
604  // read new record
605  {
     // detail::get_or_ignore<field::X> hands over the X field of the record
     // (presumably a discard placeholder if X was not selected -- confirm in detail::get_or_ignore).
606  _format.read_sequence_record(instream,
607  options,
608  position_buffer,
609  detail::get_or_ignore<field::seq>(record_buffer),
610  detail::get_or_ignore<field::id>(record_buffer),
611  detail::get_or_ignore<field::qual>(record_buffer));
612  }
613  }
614 
     // The wrapped format object that performs the parsing.
616  detail::sequence_file_input_format_exposer<format_t> _format{};
617  };
618 
621 
623  friend iterator;
624 };
625 
// Deduction guide: deduces the sequence input file type from the stream and the format.
// Result: default traits_type, default selected_field_ids and type_list<file_format> as valid formats
// (parts of the `->` clause are elided in this listing).
632 template <input_stream stream_type, sequence_file_input_format file_format>
633 sequence_file_input(stream_type & stream,
634  file_format const &)
636  typename sequence_file_input<>::selected_field_ids, // default field ids.
638 
// Deduction guide for a temporary stream plus a format; deduces the same type as the guide
// taking the stream by lvalue reference: default traits_type, default selected_field_ids,
// type_list<file_format> (parts of the `->` clause are elided in this listing).
640 template <input_stream stream_type, sequence_file_input_format file_format>
641 sequence_file_input(stream_type && stream,
642  file_format const &)
644  typename sequence_file_input<>::selected_field_ids, // default field ids.
646 
// Deduction guide: deduces the sequence input file type from the stream, the format and the field ids.
// Result: default traits_type, the given selected_field_ids and type_list<file_format>
// (the `->` clause is elided in this listing).
648 template <input_stream stream_type,
649  sequence_file_input_format file_format,
650  detail::fields_specialisation selected_field_ids>
651 sequence_file_input(stream_type && stream,
652  file_format const &,
653  selected_field_ids const &)
657 
// Deduction guide for an lvalue stream, a format and field ids; deduces the same type as the
// guide taking a temporary stream: default traits_type, the given selected_field_ids and
// type_list<file_format> (the `->` clause is elided in this listing).
659 template <input_stream stream_type,
660  sequence_file_input_format file_format,
661  detail::fields_specialisation selected_field_ids>
662 sequence_file_input(stream_type & stream,
663  file_format const &,
664  selected_field_ids const &)
669 
670 } // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:51
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:51
Quality type for traditional Sanger and modern Illumina Phred scores.
Definition: phred42.hpp:47
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: sequence_file/input.hpp:209
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream and the format.
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: sequence_file/input.hpp:268
std::default_sentinel_t sentinel
The type returned by end().
Definition: sequence_file/input.hpp:278
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: sequence_file/input.hpp:313
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
reference front() noexcept
Return the record we are currently at in the file.
Definition: sequence_file/input.hpp:465
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
requires std::same_as< typename std::remove_reference_t< stream_t >::char_type, stream_char_type > sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: sequence_file/input.hpp:364
iterator begin()
Returns an iterator to current position in the file.
Definition: sequence_file/input.hpp:412
sequence_file_input_options_type options
The options are public and its members can be set directly.
Definition: sequence_file/input.hpp:474
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector <seqan3::dna5> by default).
Definition: sequence_file/input.hpp:246
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: sequence_file/input.hpp:437
char stream_char_type
Character type of the stream(s).
Definition: sequence_file/input.hpp:222
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: sequence_file/input.hpp:270
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition: sequence_file/input.hpp:274
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition: sequence_file/input.hpp:248
~sequence_file_input()=default
Destructor is defaulted.
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: sequence_file/input.hpp:276
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: sequence_file/input.hpp:216
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: sequence_file/input.hpp:218
sequence_file_input_options< typename traits_type::sequence_legal_alphabet > sequence_file_input_options_type
The input file options type.
Definition: sequence_file/input.hpp:472
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector <seqan3::phred42> by default).
Definition: sequence_file/input.hpp:250
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: sequence_file/input.hpp:220
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream, the format and the field ids.
sequence_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field types.
Definition: sequence_file/input.hpp:256
requires std::same_as< typename std::remove_reference_t< stream_t >::char_type, stream_char_type > sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: sequence_file/input.hpp:380
T data(T... args)
Provides seqan3::dna15, container aliases and string literals.
Provides seqan3::dna5, container aliases and string literals.
Provides the seqan3::sequence_file_format_genbank class.
Provides the seqan3::format_sam.
T get(T... args)
requires requires
The rank_type of the semi-alphabet; defined as the return type of seqan3::to_rank....
Definition: alphabet/concept.hpp:164
field
An enumerator for the fields used in file formats.
Definition: record.hpp:63
Provides the seqan3::detail::in_file_iterator class template.
Checks whether from can be explicitly converted to to.
A more refined container concept than seqan3::container.
The generic concept for sequence file in formats.
The requirements a traits_type for seqan3::sequence_file_input must meet.
Refines seqan3::alphabet and adds assignability.
A concept that indicates whether a writable alphabet represents quality scores.
Provides exceptions used in the I/O module.
Stream concepts.
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::phred42 quality scores.
Provides quality alphabet composites.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_record.
T size(T... args)
A class template that holds a choice of seqan3::field.
Definition: record.hpp:128
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:237
A traits type that specifies input as amino acids.
Definition: sequence_file/input.hpp:169
The default traits for seqan3::sequence_file_input.
Definition: sequence_file/input.hpp:133
char id_alphabet
The alphabet for an identifier string is char.
Definition: sequence_file/input.hpp:150
Type that contains multiple types.
Definition: type_list.hpp:29
Provides traits for seqan3::type_list.
T visit(T... args)