SeqAn3  3.1.0-rc.1
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <seqan3/std/ranges>
17 #include <type_traits>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
27 
28 namespace seqan3
29 {
33 {
35  size_t begin_position{};
37  size_t end_position{};
38 
47  friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
48  {
49  return lhs.begin_position == rhs.begin_position && lhs.end_position == rhs.end_position;
50  }
51 
57  friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
58  {
59  return !(lhs == rhs);
60  }
62 };
63 
85 template <typename index_t>
87 {
88 public:
93  using index_type = index_t;
95  using size_type = typename index_type::size_type;
97 
98 private:
103  using node_type = detail::fm_index_cursor_node<index_t>;
105  using sdsl_char_type = typename index_type::sdsl_char_type;
107  using sdsl_index_type = typename index_t::sdsl_index_type;
109  using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
111  using index_alphabet_type = typename index_t::alphabet_type;
117 
119  index_type const * index{nullptr};
121  size_type parent_lb{};
123  size_type parent_rb{};
125  node_type node{};
127  sdsl_sigma_type sigma{};
128 
129  template <typename _index_t>
130  friend class bi_fm_index_cursor;
131 
133  size_type offset() const noexcept
134  {
135  assert(index->index.size() > query_length());
136  return index->index.size() - query_length() - 1; // since the string is reversed during construction
137  }
138 
140  bool backward_search(sdsl_index_type const & csa,
141  sdsl_char_type const c,
142  size_type & l,
143  size_type & r) const noexcept
144  {
145  assert(l <= r && r < csa.size());
146 
147  size_type _l, _r;
148 
149  size_type cc = c;
150  if constexpr(!std::same_as<index_alphabet_type, sdsl::plain_byte_alphabet>)
151  {
152  cc = csa.char2comp[c];
153  if (cc == 0 && c > 0) // [[unlikely]]
154  return false;
155  }
156 
157  size_type const c_begin = csa.C[cc];
158  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
159  {
160  _l = c_begin;
161  _r = csa.C[cc + 1] - 1;
162  // if we use not the plain_byte_alphabet, we could return always return true here
163  }
164  else
165  {
166  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
167  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
168  }
169 
170  if (_r >= _l)
171  {
172  r = _r;
173  l = _l;
174  assert(r + 1 - l >= 0);
175  return true;
176  }
177  return false;
178  }
179 
180 public:
181 
186  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
187  // std::array of iterators.
188  fm_index_cursor() noexcept = default;
189  fm_index_cursor(fm_index_cursor const &) noexcept = default;
190  fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
191  fm_index_cursor(fm_index_cursor &&) noexcept = default;
192  fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
193  ~fm_index_cursor() = default;
194 
196  fm_index_cursor(index_t const & _index) noexcept :
197  index(&_index),
198  node({0, _index.index.size() - 1, 0, 0}),
199  sigma(_index.index.sigma - index_t::text_layout_mode)
200  {
201  assert(_index.index.size() != 0);
202  }
203  //\}
204 
217  bool operator==(fm_index_cursor const & rhs) const noexcept
218  {
219  assert(index != nullptr);
220  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
221 
222  // position in the implicit suffix tree is defined by the SA interval and depth.
223  // No need to compare parent intervals
224  return node == rhs.node;
225  }
226 
239  bool operator!=(fm_index_cursor const & rhs) const noexcept
240  {
241  assert(index != nullptr);
242 
243  return !(*this == rhs);
244  }
245 
263  bool extend_right() noexcept
264  {
265  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
266  // store all cursors at once in a private std::array of cursors
267  assert(index != nullptr);
268 
269  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
270  size_type _lb = node.lb, _rb = node.rb;
271  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
272  {
273  ++c;
274  }
275 
276  if (c != sigma)
277  {
278  parent_lb = node.lb;
279  parent_rb = node.rb;
280  node = {_lb, _rb, node.depth + 1, c};
281  return true;
282  }
283  return false;
284  }
285 
299  template <typename char_t>
301  requires std::convertible_to<char_t, index_alphabet_type>
303  bool extend_right(char_t const c) noexcept
304  {
305  assert(index != nullptr);
306  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
307  // for the indexed text.
308  assert(seqan3::to_rank(static_cast<index_alphabet_type>(c)) <
309  ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
310 
311  size_type _lb = node.lb, _rb = node.rb;
312 
313  sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
314 
315  if (backward_search(index->index, c_char, _lb, _rb))
316  {
317  parent_lb = node.lb;
318  parent_rb = node.rb;
319  node = {_lb, _rb, node.depth + 1, c_char};
320  return true;
321  }
322  return false;
323  }
324 
326  template <typename char_type>
328  requires detail::is_char_adaptation_v<char_type>
330  bool extend_right(char_type const * cstring) noexcept
331  {
333  }
334 
351  template <std::ranges::range seq_t>
352  bool extend_right(seq_t && seq) noexcept
353  {
354  static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
355  static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
356  "The alphabet of the sequence must be convertible to the alphabet of the index.");
357 
358  assert(index != nullptr); // range must not be empty!
359 
360  size_type _lb = node.lb, _rb = node.rb;
361  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
362 
363  sdsl_char_type c{};
364  size_t len{0};
365 
366  for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
367  {
368  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
369  // for the indexed text.
370  assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it)) <
371  ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
372 
373  c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
374 
375  new_parent_lb = _lb;
376  new_parent_rb = _rb;
377  if (!backward_search(index->index, c, _lb, _rb))
378  return false;
379  }
380 
381  parent_lb = new_parent_lb;
382  parent_rb = new_parent_rb;
383  node = {_lb, _rb, len + node.depth, c};
384  return true;
385  }
386 
413  bool cycle_back() noexcept
414  {
415  assert(index != nullptr && query_length() > 0);
416  // parent_lb > parent_rb --> invalid interval
417  assert(parent_lb <= parent_rb);
418 
419  sdsl_char_type c = node.last_char + 1;
420  size_type _lb = parent_lb, _rb = parent_rb;
421 
422  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
423  {
424  ++c;
425  }
426 
427  if (c != sigma) // Collection has additional sentinel as delimiter
428  {
429  node = {_lb, _rb, node.depth, c};
430  return true;
431  }
432  return false;
433  }
434 
450  size_type last_rank() const noexcept
451  {
452  // parent_lb > parent_rb --> invalid interval
453  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
454 
455  return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
456  }
457 
474  {
475  assert(index != nullptr);
476 
477  return {node.lb, node.rb + 1};
478  }
479 
498  size_type query_length() const noexcept
499  {
500  assert(index != nullptr);
501  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
502 
503  return node.depth;
504  }
505 
523  template <std::ranges::range text_t>
524  auto path_label(text_t && text) const noexcept
526  requires (index_t::text_layout_mode == text_layout::single)
528  {
529  static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
530  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
531  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
532  "The alphabet types of the given text and index differ.");
533  assert(index != nullptr);
534 
535  size_type const query_begin = offset() - index->index[node.lb];
536  return text | views::slice(query_begin, query_begin + query_length());
537  }
538 
540  template <std::ranges::range text_t>
541  auto path_label(text_t && text) const noexcept
543  requires (index_t::text_layout_mode == text_layout::collection)
545  {
546  static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
547  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
548  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
549  "The alphabet types of the given text and index differ.");
550  assert(index != nullptr);
551 
552  // Position of query in concatenated text.
553  size_type const location = offset() - index->index[node.lb];
554 
555  // The rank represents the number of start positions of the individual sequences/texts in the collection
556  // before position `location + 1` and thereby also the number of delimiters.
557  size_type const rank = index->text_begin_rs.rank(location + 1);
558  assert(rank > 0);
559  size_type const text_id = rank - 1;
560 
561  // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
562  size_type const start_location = index->text_begin_ss.select(rank);
563  // Substract lengths of previous sequences.
564  size_type const query_begin = location - start_location;
565 
566  // Take subtext, slice query out of it
567  return text[text_id] | views::slice(query_begin, query_begin + query_length());
568  }
569 
581  size_type count() const noexcept
582  {
583  assert(index != nullptr);
584 
585  return 1 + node.rb - node.lb;
586  }
587 
601  requires (index_t::text_layout_mode == text_layout::single)
603  {
604  assert(index != nullptr);
605 
606  locate_result_type occ{};
607  occ.reserve(count());
608  for (size_type i = 0; i < count(); ++i)
609  {
610  occ.emplace_back(0, offset() - index->index[node.lb + i]);
611  }
612 
613  return occ;
614  }
615 
619  requires (index_t::text_layout_mode == text_layout::collection)
621  {
622  assert(index != nullptr);
623 
624  locate_result_type occ;
625  occ.reserve(count());
626  for (size_type i = 0; i < count(); ++i)
627  {
628  size_type loc = offset() - index->index[node.lb + i];
629  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
630  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
631  occ.emplace_back(sequence_rank - 1, sequence_position);
632  }
633  return occ;
634  }
635 
648  auto lazy_locate() const
650  requires (index_t::text_layout_mode == text_layout::single)
652  {
653  assert(index != nullptr);
654 
655  return std::views::iota(node.lb, node.lb + count())
656  | std::views::transform([*this, _offset = offset()] (auto sa_pos)
657  {
658  return locate_result_value_type{0u, _offset - index->index[sa_pos]};
659  });
660  }
661 
663  auto lazy_locate() const
665  requires (index_t::text_layout_mode == text_layout::collection)
667  {
668  assert(index != nullptr);
669 
670  return std::views::iota(node.lb, node.lb + count())
671  | std::views::transform([*this, _offset = offset()] (auto sa_pos)
672  {
673  auto loc = _offset - index->index[sa_pos];
674  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
675  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
676  return locate_result_value_type{sequence_rank - 1, sequence_position};
677  });
678  }
679 
687  template <cereal_archive archive_t>
688  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
689  {
690  archive(parent_lb);
691  archive(parent_rb);
692  archive(node);
693  archive(sigma);
694  }
696 };
697 
698 } // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:87
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:352
locate_result_type locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:599
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:303
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition: fm_index_cursor.hpp:524
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:413
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:239
seqan3::suffix_array_interval suffix_array_interval() const noexcept
Returns the half-open suffix array interval.
Definition: fm_index_cursor.hpp:473
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:217
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:581
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:450
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:263
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_cursor.hpp:498
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:93
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:95
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition: fm_index_cursor.hpp:648
bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:330
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T emplace_back(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
typename range_innermost_value< t >::type range_innermost_value_t
Shortcut for seqan3::range_innermost_value (transformation_trait shortcut).
Definition: type_traits.hpp:91
@ seq
The "sequence", usually a range of nucleotides or amino acids.
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:72
@ single
The text is a single range.
Definition: concept.hpp:74
@ collection
The text is a range of ranges.
Definition: concept.hpp:76
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:188
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides seqan3::views::slice.
The underlying suffix array interval.
Definition: fm_index_cursor.hpp:33
size_t end_position
The exclusive end position of the interval ("right boundary").
Definition: fm_index_cursor.hpp:37
size_t begin_position
The begin position of the interval ("left boundary").
Definition: fm_index_cursor.hpp:35
friend bool operator==(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for equality.
Definition: fm_index_cursor.hpp:47
friend bool operator!=(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for inequality.
Definition: fm_index_cursor.hpp:57