BamTools  2.5.1
BamAlignment.h
Go to the documentation of this file.
1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
9 
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
12 
13 #include <cstddef>
14 #include <cstdlib>
15 #include <cstring>
16 #include <string>
17 #include <vector>
18 #include "api/BamAux.h"
19 #include "api/BamConstants.h"
20 #include "api/api_global.h"
21 
22 namespace BamTools {
23 
25 // forward declaration of BamAlignment's "friends"
26 namespace Internal {
27 class BamReaderPrivate;
28 class BamWriterPrivate;
29 } // namespace Internal
31 
32 // BamAlignment data structure
34 {
35 
36  // constructors & destructor
37 public:
38  BamAlignment();
39  BamAlignment(const BamAlignment& other);
40  ~BamAlignment();
41 
42  // queries against alignment flags
43 public:
44  bool IsDuplicate() const; // returns true if this read is a PCR duplicate
45  bool IsFailedQC() const; // returns true if this read failed quality control
46  bool IsFirstMate() const; // returns true if alignment is first mate on read
47  bool IsMapped() const; // returns true if alignment is mapped
48  bool IsMateMapped() const; // returns true if alignment's mate is mapped
49  bool IsMateReverseStrand() const; // returns true if alignment's mate mapped to reverse strand
50  bool IsPaired() const; // returns true if alignment part of paired-end read
51  bool IsPrimaryAlignment() const; // returns true if reported position is primary alignment
52  bool IsProperPair()
53  const; // returns true if alignment is part of read that satisfied paired-end resolution
54  bool IsReverseStrand() const; // returns true if alignment mapped to reverse strand
55  bool IsSecondMate() const; // returns true if alignment is second mate on read
56  bool IsSupplementaryAlignment(void) const; // Additional patch by tiddit package
57 
58  // manipulate alignment flags
59 public:
60  void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
61  void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
62  void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
63  void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
64  void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
65  void SetIsMateReverseStrand(
66  bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
67  void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
68  void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
69  void SetIsProperPair(
70  bool
71  ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
72  void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
73  void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
74 
75  // tag data access methods
76 public:
77  // add a new tag
78  template <typename T>
79  bool AddTag(const std::string& tag, const std::string& type, const T& value);
80  template <typename T>
81  bool AddTag(const std::string& tag, const std::vector<T>& values);
82 
83  // edit (or append) tag
84  template <typename T>
85  bool EditTag(const std::string& tag, const std::string& type, const T& value);
86  template <typename T>
87  bool EditTag(const std::string& tag, const std::vector<T>& values);
88 
89  // retrieves tag data
90  template <typename T>
91  bool GetTag(const std::string& tag, T& destination) const;
92  template <typename T>
93  bool GetTag(const std::string& tag, std::vector<T>& destination) const;
94 
95  // retrieves all current tag names
96  std::vector<std::string> GetTagNames() const;
97 
98  // retrieves the SAM/BAM type-code for requested tag name
99  bool GetTagType(const std::string& tag, char& type) const;
100 
101  // retrieves the SAM/BAM type-code for the data elements in an array tag
102  bool GetArrayTagType(const std::string& tag, char& type) const;
103 
104  // returns true if alignment has a record for this tag name
105  bool HasTag(const std::string& tag) const;
106 
107  // removes a tag
108  void RemoveTag(const std::string& tag);
109 
110  // additional methods
111 public:
112  // populates alignment string fields
113  bool BuildCharData();
114 
115  // calculates alignment end position
116  int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
117 
118  // returns a description of the last error that occurred
119  std::string GetErrorString() const;
120 
121  // retrieves the size, read locations and reference locations of soft-clip operations
122  bool GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions,
123  std::vector<int>& genomePositions, bool usePadded = false) const;
124 
125  // public data fields
126 public:
127  std::string Name; // read name
128  int32_t Length; // length of query sequence
129  std::string QueryBases; // 'original' sequence (contained in BAM file)
130  std::string
131  AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
132  std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
133  std::string TagData; // tag data (use provided methods to query/modify)
134  int32_t RefID; // ID number for reference sequence
135  int32_t Position; // position (0-based) where alignment starts
136  uint16_t Bin; // BAM (standard) index bin number for this alignment
137  uint16_t MapQuality; // mapping quality score
138  uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
139  std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
140  int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
141  int32_t MatePosition; // position (0-based) where alignment's mate starts
142  int32_t InsertSize; // mate-pair insert size
143  std::string Filename; // name of BAM file which this alignment comes from
144 
146  // internal utility methods
147 private:
148  bool FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength,
149  unsigned int& numBytesParsed) const;
150  bool IsValidSize(const std::string& tag, const std::string& type) const;
151  void SetErrorString(const std::string& where, const std::string& what) const;
152  bool SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const;
154  // internal data
155 private:
156  struct BamAlignmentSupportData
157  {
158 
160  // data members
161  std::string AllCharData;
162  uint32_t BlockLength;
163  uint32_t NumCigarOperations;
164  uint32_t QueryNameLength;
165  uint32_t QuerySequenceLength;
166  bool HasCoreOnly;
167 
169  // constructor
170  BamAlignmentSupportData()
171  : BlockLength(0)
172  , NumCigarOperations(0)
173  , QueryNameLength(0)
174  , QuerySequenceLength(0)
175  , HasCoreOnly(false)
176  {}
177  };
178  BamAlignmentSupportData SupportData;
179  friend class Internal::BamReaderPrivate;
180  friend class Internal::BamWriterPrivate;
181 
182  mutable std::string ErrorString; // mutable to allow updates even in logically const methods
183 };
184 
185 // ---------------------------------------------------------
186 // BamAlignment tag access methods
187 
199 template <typename T>
200 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value)
201 {
202 
203  // if char data not populated, do that first
204  if (SupportData.HasCoreOnly) BuildCharData();
205 
206  // check tag/type size
207  if (!IsValidSize(tag, type)) {
208  // TODO: set error string?
209  return false;
210  }
211 
212  // check that storage type code is OK for T
213  if (!TagTypeHelper<T>::CanConvertTo(type.at(0))) {
214  // TODO: set error string?
215  return false;
216  }
217 
218  // localize the tag data
219  char* pTagData = (char*)TagData.data();
220  const unsigned int tagDataLength = TagData.size();
221  unsigned int numBytesParsed = 0;
222 
223  // if tag already exists, return false
224  // use EditTag explicitly instead
225  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
226  // TODO: set error string?
227  return false;
228  }
229 
230  // otherwise, convert value to string
231  union
232  {
233  T value;
234  char valueBuffer[sizeof(T)];
235  } un;
236  un.value = value;
237 
238  // copy original tag data to temp buffer
239  const std::string newTag = tag + type;
240  const std::size_t newTagDataLength =
241  tagDataLength + newTag.size() + sizeof(T); // leave room for new T
242  RaiiBuffer originalTagData(newTagDataLength);
243  memcpy(originalTagData.Buffer, TagData.c_str(),
244  tagDataLength + 1); // '+1' for TagData null-term
245 
246  // append newTag
247  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
248  memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
249 
250  // store temp buffer back in TagData
251  const char* newTagData = (const char*)originalTagData.Buffer;
252  TagData.assign(newTagData, newTagDataLength);
253  return true;
254 }
255 
256 template <>
257 inline bool BamAlignment::AddTag<std::string>(const std::string& tag, const std::string& type,
258  const std::string& value)
259 {
260  // if char data not populated, do that first
261  if (SupportData.HasCoreOnly) BuildCharData();
262 
263  // check tag/type size
264  if (!IsValidSize(tag, type)) {
265  // TODO: set error string?
266  return false;
267  }
268 
269  // check that storage type code is OK for string
270  if (!TagTypeHelper<std::string>::CanConvertTo(type.at(0))) {
271  // TODO: set error string?
272  return false;
273  }
274 
275  // localize the tag data
276  char* pTagData = (char*)TagData.data();
277  const unsigned int tagDataLength = TagData.size();
278  unsigned int numBytesParsed = 0;
279 
280  // if tag already exists, return false
281  // use EditTag explicitly instead
282  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
283  // TODO: set error string?
284  return false;
285  }
286 
287  // otherwise, copy tag data to temp buffer
288  const std::string newTag = tag + type + value;
289  const std::size_t newTagDataLength =
290  tagDataLength + newTag.size() + 1; // leave room for null-term
291  RaiiBuffer originalTagData(newTagDataLength);
292  memcpy(originalTagData.Buffer, TagData.c_str(),
293  tagDataLength + 1); // '+1' for TagData null-term
294 
295  // append newTag (removes original null-term, then appends newTag + null-term)
296  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
297 
298  // store temp buffer back in TagData
299  const char* newTagData = (const char*)originalTagData.Buffer;
300  TagData.assign(newTagData, newTagDataLength);
301  return true;
302 }
303 
314 template <typename T>
315 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values)
316 {
317 
318  // if char data not populated, do that first
319  if (SupportData.HasCoreOnly) BuildCharData();
320 
321  // check for valid tag name length
322  if (tag.size() != Constants::BAM_TAG_TAGSIZE) return false;
323 
324  // localize the tag data
325  char* pTagData = (char*)TagData.data();
326  const unsigned int tagDataLength = TagData.size();
327  unsigned int numBytesParsed = 0;
328 
329  // if tag already exists, return false
330  // use EditTag explicitly instead
331  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
332  // TODO: set error string?
333  return false;
334  }
335 
336  // build new tag's base information
337  char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
338  memcpy(newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE);
339  newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
340  newTagBase[3] = TagTypeHelper<T>::TypeCode();
341 
342  // add number of array elements to newTagBase
343  const int32_t numElements = values.size();
344  memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
345 
346  // copy current TagData string to temp buffer, leaving room for new tag's contents
347  const std::size_t newTagDataLength =
348  tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + numElements * sizeof(T);
349  RaiiBuffer originalTagData(newTagDataLength);
350  memcpy(originalTagData.Buffer, TagData.c_str(),
351  tagDataLength + 1); // '+1' for TagData's null-term
352 
353  // write newTagBase (removes old null term)
354  strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
355 
356  // add vector elements to tag
357  int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
358  for (int i = 0; i < numElements; ++i) {
359  const T& value = values.at(i);
360  memcpy(originalTagData.Buffer + elementsBeginOffset + i * sizeof(T), &value, sizeof(T));
361  }
362 
363  // store temp buffer back in TagData
364  const char* newTagData = (const char*)originalTagData.Buffer;
365  TagData.assign(newTagData, newTagDataLength);
366  return true;
367 }
368 
383 template <typename T>
384 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value)
385 {
386 
387  // if char data not populated, do that first
388  if (SupportData.HasCoreOnly) BuildCharData();
389 
390  // remove existing tag if present, then append tag with new value
391  if (HasTag(tag)) RemoveTag(tag);
392  return AddTag(tag, type, value);
393 }
394 
406 template <typename T>
407 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values)
408 {
409 
410  // if char data not populated, do that first
411  if (SupportData.HasCoreOnly) BuildCharData();
412 
413  // remove existing tag if present, then append tag with new values
414  if (HasTag(tag)) RemoveTag(tag);
415  return AddTag(tag, values);
416 }
417 
425 template <typename T>
426 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const
427 {
428 
429  // skip if alignment is core-only
430  if (SupportData.HasCoreOnly) {
431  // TODO: set error string?
432  return false;
433  }
434 
435  // skip if no tags present
436  if (TagData.empty()) {
437  // TODO: set error string?
438  return false;
439  }
440 
441  // localize the tag data
442  char* pTagData = (char*)TagData.data();
443  const unsigned int tagDataLength = TagData.size();
444  unsigned int numBytesParsed = 0;
445 
446  // return failure if tag not found
447  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
448  // TODO: set error string?
449  return false;
450  }
451 
452  // fetch data type
453  const char type = *(pTagData - 1);
454  if (!TagTypeHelper<T>::CanConvertFrom(type)) {
455  // TODO: set error string ?
456  return false;
457  }
458 
459  // determine data length
460  int destinationLength = 0;
461  switch (type) {
462 
463  // 1 byte data
467  destinationLength = 1;
468  break;
469 
470  // 2 byte data
473  destinationLength = 2;
474  break;
475 
476  // 4 byte data
480  destinationLength = 4;
481  break;
482 
483  // var-length types not supported for numeric destination
487  SetErrorString("BamAlignment::GetTag",
488  "cannot store variable length tag data into a numeric destination");
489  return false;
490 
491  // unrecognized tag type
492  default:
493  const std::string message = std::string("invalid tag type: ") + type;
494  SetErrorString("BamAlignment::GetTag", message);
495  return false;
496  }
497 
498  // store data in destination
499  destination = 0;
500  memcpy(&destination, pTagData, destinationLength);
501 
502  // return success
503  return true;
504 }
505 
506 template <>
507 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
508  std::string& destination) const
509 {
510  // skip if alignment is core-only
511  if (SupportData.HasCoreOnly) {
512  // TODO: set error string?
513  return false;
514  }
515 
516  // skip if no tags present
517  if (TagData.empty()) {
518  // TODO: set error string?
519  return false;
520  }
521 
522  // localize the tag data
523  char* pTagData = (char*)TagData.data();
524  const unsigned int tagDataLength = TagData.size();
525  unsigned int numBytesParsed = 0;
526 
527  // return failure if tag not found
528  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
529  // TODO: set error string?
530  return false;
531  }
532 
533  // otherwise copy data into destination
534  const unsigned int dataLength = strlen(pTagData);
535  destination.clear();
536  destination.resize(dataLength);
537  memcpy((char*)destination.data(), pTagData, dataLength);
538 
539  // return success
540  return true;
541 }
542 
550 template <typename T>
551 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const
552 {
553 
554  // skip if alignment is core-only
555  if (SupportData.HasCoreOnly) {
556  // TODO: set error string?
557  return false;
558  }
559 
560  // skip if no tags present
561  if (TagData.empty()) {
562  // TODO: set error string?
563  return false;
564  }
565 
566  // localize the tag data
567  char* pTagData = (char*)TagData.data();
568  const unsigned int tagDataLength = TagData.size();
569  unsigned int numBytesParsed = 0;
570 
571  // return false if tag not found
572  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
573  // TODO: set error string?
574  return false;
575  }
576 
577  // check that tag is array type
578  const char tagType = *(pTagData - 1);
579  if (tagType != Constants::BAM_TAG_TYPE_ARRAY) {
580  SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
581  return false;
582  }
583 
584  // fetch element type
585  const char elementType = *pTagData;
586  if (!TagTypeHelper<T>::CanConvertFrom(elementType)) {
587  // TODO: set error string ?
588  return false;
589  }
590  ++pTagData;
591 
592  // calculate length of each element in tag's array
593  switch (elementType) {
597  break;
598 
601  break;
602 
606  break;
607 
608  // var-length types not supported for numeric destination
612  SetErrorString("BamAlignment::GetTag",
613  "invalid array data, variable-length elements are not allowed");
614  return false;
615 
616  // unknown tag type
617  default:
618  const std::string message = std::string("invalid array element type: ") + elementType;
619  SetErrorString("BamAlignment::GetTag", message);
620  return false;
621  }
622 
623  // get number of elements
624  int32_t numElements;
625  memcpy(&numElements, pTagData, sizeof(int32_t));
626  pTagData += 4;
627  destination.clear();
628  destination.reserve(numElements);
629 
630  // read in elements
631  T value;
632  for (int i = 0; i < numElements; ++i) {
633  memcpy(&value, pTagData, sizeof(T));
634  pTagData += sizeof(T);
635  destination.push_back(value);
636  }
637 
638  // return success
639  return true;
640 }
641 
642 typedef std::vector<BamAlignment> BamAlignmentVector;
643 
644 } // namespace BamTools
645 
646 #endif // BAMALIGNMENT_H
#define API_EXPORT
Definition: api_global.h:18
The main BAM alignment data structure.
Definition: BamAlignment.h:34
bool BuildCharData()
Populates alignment string fields (read name, bases, qualities, tag data).
Definition: BamAlignment.cpp:135
int32_t InsertSize
mate-pair insert size
Definition: BamAlignment.h:142
bool GetTag(const std::string &tag, T &destination) const
Definition: BamAlignment.h:426
int32_t Length
length of query sequence
Definition: BamAlignment.h:128
uint32_t AlignmentFlag
alignment bit-flag (use the provided methods to query/modify)
Definition: BamAlignment.h:138
bool AddTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:200
std::string AlignedBases
'aligned' sequence (includes any indels, padding, clipping)
Definition: BamAlignment.h:131
int32_t RefID
ID number for reference sequence.
Definition: BamAlignment.h:134
std::string Name
read name
Definition: BamAlignment.h:127
uint16_t MapQuality
mapping quality score
Definition: BamAlignment.h:137
std::string Qualities
FASTQ qualities (ASCII characters, not numeric values)
Definition: BamAlignment.h:132
uint16_t Bin
BAM (standard) index bin number for this alignment.
Definition: BamAlignment.h:136
int32_t MatePosition
position (0-based) where alignment's mate starts
Definition: BamAlignment.h:141
std::string TagData
tag data (use the provided methods to query/modify)
Definition: BamAlignment.h:133
std::string Filename
name of BAM file which this alignment comes from
Definition: BamAlignment.h:143
std::vector< CigarOp > CigarData
CIGAR operations for this alignment.
Definition: BamAlignment.h:139
int32_t MateRefID
ID number for reference sequence where alignment's mate was aligned.
Definition: BamAlignment.h:140
bool HasTag(const std::string &tag) const
Returns true if alignment has a record for requested tag.
Definition: BamAlignment.cpp:722
int32_t Position
position (0-based) where alignment starts
Definition: BamAlignment.h:135
void RemoveTag(const std::string &tag)
Removes field from BAM tags.
Definition: BamAlignment.cpp:852
std::string QueryBases
'original' sequence (as reported from sequencing machine)
Definition: BamAlignment.h:129
bool EditTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:384
const char BAM_TAG_TYPE_UINT8
Definition: BamConstants.h:76
const char BAM_TAG_TYPE_HEX
Definition: BamConstants.h:83
const char BAM_TAG_TYPE_INT32
Definition: BamConstants.h:79
const char BAM_TAG_TYPE_ASCII
Definition: BamConstants.h:74
const uint8_t BAM_TAG_TAGSIZE
Definition: BamConstants.h:86
const char BAM_TAG_TYPE_ARRAY
Definition: BamConstants.h:84
const char BAM_TAG_TYPE_FLOAT
Definition: BamConstants.h:81
const char BAM_TAG_TYPE_UINT32
Definition: BamConstants.h:80
const char BAM_TAG_TYPE_STRING
Definition: BamConstants.h:82
const char BAM_TAG_TYPE_INT8
Definition: BamConstants.h:75
const char BAM_TAG_TYPE_UINT16
Definition: BamConstants.h:78
const char BAM_TAG_TYPE_INT16
Definition: BamConstants.h:77
const uint8_t BAM_TAG_ARRAYBASE_SIZE
Definition: BamConstants.h:88
Contains all BamTools classes & methods.
Definition: Sort.h:24
std::vector< BamAlignment > BamAlignmentVector
Definition: BamAlignment.h:642