Skip to content

Commit 43f205e

Browse files
committed
docu
1 parent e6f581e commit 43f205e

File tree

4 files changed

+108
-77
lines changed

4 files changed

+108
-77
lines changed

include/bio/map_io/all.hpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// -----------------------------------------------------------------------------------------------------
2+
// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3+
// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4+
// Copyright (c) 2020-2021, deCODE Genetics
5+
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
6+
// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE
7+
// -----------------------------------------------------------------------------------------------------
8+
9+
/*!\file
10+
* \brief Meta-include that includes the whole Map I/O module.
11+
* \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de>
12+
*/
13+
14+
#pragma once
15+
16+
/*!\defgroup map_io Map I/O
17+
* \ingroup bio
18+
* \brief Reader and writer for SAM and BAM files.
19+
*
20+
* This module provides high-level APIs to read and write SAM and BAM files.
21+
*
22+
* To read files, have a look at bio::map_io::reader and to write files have a look at bio::map_io::writer.
23+
*
24+
*/
25+
26+
/*!\namespace bio::map_io
27+
* \brief Namespace for the Map I/O module.
28+
*/

include/bio/map_io/header.hpp

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,15 @@ namespace bio::map_io
4040
* \ingroup map_io
4141
* \details
4242
*
43-
* TODO
43+
* Each header line begins with the character `@` followed by one of the two-letter header record type codes
44+
* defined in this section. In the header, each line is tab-delimited and, apart from `@CO` lines, each data field
45+
* follows a format `TAG:VALUE` where TAG is a two-character string that defines the format and content of
46+
* VALUE. Thus header lines match `/^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/` or are comment lines starting
47+
* with `@CO` followed by a tab and any character sequence.
48+
* Within each (non-`@CO`) header line, no field tag may appear more than once and the order in which the fields
49+
* appear is not significant.
50+
*
51+
* \sa https://samtools.github.io/hts-specs/SAMv1.pdf
4452
*/
4553
class header
4654
{
@@ -57,7 +65,6 @@ class header
5765
header & operator=(header &&) = default; //!< Defaulted.
5866

5967
/*!\brief Construct from a range of reference ids.
60-
* \param[in] The plain text header.
6168
* \param[in] ref_ids The range over reference ids to redirect the pointer at.
6269
*/
6370
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
@@ -95,6 +102,7 @@ class header
95102
//!\brief The reference sequence names.
96103
std::vector<std::string_view> reference_names;
97104

105+
//!\brief Additional information on the reference sequences (same ordering as `reference_names`).
98106
std::vector<std::tuple<int32_t, std::string>> reference_names_info{};
99107

100108
//!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
@@ -103,6 +111,7 @@ class header
103111
//!\brief Whether reference sequence names were given to the header on construction.
104112
bool reference_names_given_on_construction{false};
105113

114+
//!\brief Print a B.I.O warning message with current line number in diagnostic.
106115
/* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const
107116
{
108117
// if (print_warnings)
@@ -114,22 +123,22 @@ class header
114123
// }
115124
}
116125
public:
117-
/*!\name [@HD] File-level meta data
126+
/*!\name [HD] File-level meta data
118127
* \brief You can directly edit these member variables.
119128
* \{
120129
*/
121-
std::string format_version{}; //!< [@HD VN] The file format version. Note: this is overwritten by our formats on output.
122-
std::string sorting{}; //!< [@HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
123-
std::string grouping{}; //!< [@HD GO] The grouping of the file. SAM: [none, query, reference].
124-
std::string subsorting{}; //!< [@HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate](:[A-Za-z0-9_-]+)+.
130+
std::string format_version{}; //!< [HD VN] The file format version. Note: this is overwritten by our formats on output.
131+
std::string sorting{}; //!< [HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
132+
std::string grouping{}; //!< [HD GO] The grouping of the file. SAM: [none, query, reference].
133+
std::string subsorting{}; //!< [HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate]`(:[A-Za-z0-9_-]+)+`.
125134
//!\}
126135

127-
/*!\name [@SQ] Reference sequence dictionary
136+
/*!\name [SQ] Reference sequence dictionary
128137
* \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers.
129138
* \{
130139
*/
131140

132-
/*!\brief [@SQ SN] Reference sequence names
141+
/*!\brief [SQ SN] Reference sequence names
133142
*
134143
* \details
135144
*
@@ -139,11 +148,11 @@ class header
139148
* 1) Reference id information is provided on construction. In this case, no copy is made but this function
140149
* gives you a reference to the provided range. When reading the header or the records, their reference
141150
* information will be checked against the given input.
142-
* 2) No reference information is provided on construction but the \@SQ tags are present in the header.
151+
* 2) No reference information is provided on construction but the `@SQ` tags are present in the header.
143152
* In this case, the reference id information is extracted from the header and this member function provides
144153
* access to them. When reading the records, their reference id information will be checked against the header
145154
* information.
146-
* 3) No reference information is provided on construction an no \@SQ tags are present in the header.
155+
* 3) No reference information is provided on construction and no `@SQ` tags are present in the header.
147156
* In this case, the reference information is parsed from the records field::ref_id and stored in the header.
148157
* This member function then provides access to the unique list of reference names encountered in the records.
149158
*/
@@ -152,11 +161,11 @@ class header
152161
return reference_names;
153162
}
154163

155-
/*!\brief [@SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
164+
/*!\brief [SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
156165
*
157166
* \details
158167
*
159-
* The reference information store the length (\@LN tag) and
168+
* The reference information stores the length (`@LN` tag) and
160169
* additional information of each reference sequence in the file. The record
161170
* must then store only the index of the reference.
162171
* The name and length information are required if the header is provided
@@ -166,17 +175,17 @@ class header
166175
*
167176
* The additional information (2nd tuple entry) must model
168177
* the following formatting rules: The information is given in a tab separated
169-
* TAG:VALUE format, where TAG must be one of [AH, AN, AS, m5, SP, UR].
178+
* `TAG:VALUE` format, where TAG must be one of [AH, AN, AS, M5, SP, UR].
170179
* The following information and rules apply for each tag (taken from the SAM specs):
171180
*
172181
* * **AH:** Indicates that this sequence is an alternate locus. The value is the locus in the primary assembly for
173-
* which this sequence is an alternative, in the format 'chr:start-end', 'chr' (if known), or '*' (if
174-
* unknown), where 'chr' is a sequence in the primary assembly. Must not be present on sequences in the
182+
* which this sequence is an alternative, in the format `chr:start-end`, `chr` (if known), or `*` (if
183+
* unknown), where `chr` is a sequence in the primary assembly. Must not be present on sequences in the
175184
* primary assembly.
176185
* * **AN:** Alternative reference sequence names. A comma-separated list of alternative names that tools may use
177186
* when referring to this reference sequence. These alternative names are not used elsewhere within the
178187
* SAM file; in particular, they must not appear in alignment records’ RNAME or RNEXT fields. regular
179-
* expression : name (, name )* where name is [0-9A-Za-z][0-9A-Za-z*+.@ \|-]*
188+
* expression : `name (, name )*` where name is `[0-9A-Za-z][0-9A-Za-z*+.@ \|-]*`.
180189
* * **AS:** Genome assembly identifier.
181190
* * **M5:** MD5 checksum of the sequence. See Section 1.3.1
182191
* * **SP:** Species.
@@ -204,7 +213,7 @@ class header
204213
}
205214
//!\}
206215

207-
/*!\name [@RG] Read groups
216+
/*!\name [RG] Read groups
208217
* \brief You can directly edit these member variables.
209218
* \{
210219
*/
@@ -215,7 +224,7 @@ class header
215224
* The read group list stores the group id and
216225
* additional information of each read group in the file. The record
217226
* may store an RG tag referencing one of the stored ids.
218-
* The id information is required if the @RG header line is provided.
227+
* The id information is required if the \@RG header line is provided.
219228
*
220229
* The additional information (2nd tuple entry) for the SAM format must follow
221230
* the following formatting rules: The information is given in a tab separated
@@ -225,13 +234,13 @@ class header
225234
* * **BC:** Barcode sequence identifying the sample or library. This value is the expected barcode bases as read by
226235
* the sequencing machine in the absence of errors. If there are several barcodes for the sample/library
227236
* (e.g., one on each end of the template), the recommended implementation concatenates all the barcodes
228-
* separating them with hyphens ('-').
237+
* separating them with hyphens (`-`).
229238
* * **CN:** Name of sequencing center producing the read.
230239
* * **DS:** Description. UTF-8 encoding may be used.
231240
* * **DT:** Date the run was produced (ISO8601 date or date/time).
232241
* * **FO:** Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each
233242
* read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other
234-
* characters. Format : /\*\|[ACMGRSVTWYHKDBN]+/
243+
* characters. Format : `/\*\|[ACMGRSVTWYHKDBN]+/`
235244
* * **KS:** The array of nucleotide bases that correspond to the key sequence of each read.
236245
* * **LB:** Library.
237246
* * **PG:** Programs used for processing the read group.
@@ -245,7 +254,7 @@ class header
245254
std::vector<std::pair<std::string, std::string>> read_groups{};
246255
//!\}
247256

248-
/*!\name [@PG] Programm information
257+
/*!\name [PG] Program information
249258
* \brief You can directly edit these member variables.
250259
* \{
251260
*/
@@ -263,7 +272,7 @@ class header
263272
std::vector<program_info_t> program_infos{}; //!< The list of program information.
264273
//!\}
265274

266-
/*!\name [@CO] Comments
275+
/*!\name [CO] Comments
267276
* \brief You can directly edit these member variables.
268277
* \{
269278
*/
@@ -272,17 +281,16 @@ class header
272281
};
273282

274283
/*!\brief Reads the SAM header.
275-
* \tparam stream_view_type The type of the stream as a view.
276-
* \param[in, out] stream_view The stream view to iterate over.
284+
* \param[in] header_string The full header as a std::string_view.
277285
*
278-
* \throws seqan3::format_error if any unexpected character or format is encountered.
286+
* \throws bio::map_io::format_error if any unexpected character or format is encountered.
279287
*
280288
* \details
281289
*
282290
* Reading the header format is done according to the official
283291
* [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf).
284292
*
285-
* The function throws a seqan3::format_error if any unknown tag was encountered. It will also fail if the format is
293+
* The function throws a bio::map_io::format_error if any unknown tag was encountered. It will also fail if the format is
286294
* not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
287295
* error.
288296
*/

include/bio/map_io/sam_flag.hpp

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -35,37 +35,35 @@ namespace bio::map_io
3535
* Adapted from the [SAM specifications](https://samtools.github.io/hts-specs/SAMv1.pdf) are the following additional
3636
* information to some flag values:
3737
* * For each read/contig in a SAM file, it is required that one and only one line associated with the read
38-
* has neither the seqan3::sam_flag::secondary_alignment nor the seqan3::sam_flag::supplementary_alignment flag value
39-
* set (satisfies `FLAG & 0x900 == 0 `). This line is called the **primary alignment** of the read.
40-
* * seqan3::sam_flag::secondary_alignment (bit `0x100`) marks the alignment not to be used in certain analyses when
41-
* the tools in use are aware of this bit. It is typically used to flag alternative mappings when multiple mappings
38+
* has neither the bio::map_io::sam_flag::secondary_alignment nor the bio::map_io::sam_flag::supplementary_alignment
39+
* flag value set (satisfies `FLAG & 0x900 == 0 `). This line is called the **primary alignment** of the read.
40+
* * bio::map_io::sam_flag::secondary_alignment (bit `0x100`) marks the alignment not to be used in certain analyses
41+
* when the tools in use are aware of this bit. It is typically used to flag alternative mappings when multiple mappings
4242
* are presented in a SAM.
43-
* * seqan3::sam_flag::supplementary_alignment (bit `0x800`) indicates that the corresponding alignment line is part
43+
* * bio::map_io::sam_flag::supplementary_alignment (bit `0x800`) indicates that the corresponding alignment line is part
4444
* of a chimeric alignment. If the SAM/BAM file corresponds to long reads (nanopore/pacbio) this happens when
4545
* reads are split before being aligned and the best matching part is marked as primary, while all other aligned
4646
* parts are marked supplementary.
47-
* * seqan3::sam_flag::unmapped (bit `0x4`) is the only reliable place to tell whether the read is unmapped.
48-
* If seqan3::sam_flag::unmapped is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, and
49-
* seqan3::sam_flag::proper_pair, seqan3::sam_flag::secondary_alignment, and seqan3::sam_flag::supplementary_alignment
47+
* * bio::map_io::sam_flag::unmapped (bit `0x4`) is the only reliable place to tell whether the read is unmapped.
48+
* If bio::map_io::sam_flag::unmapped is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, and
49+
* bio::map_io::sam_flag::proper_pair, bio::map_io::sam_flag::secondary_alignment, and bio::map_io::sam_flag::supplementary_alignment
5050
* (bits `0x2`, `0x100`, and `0x800`).
51-
* * seqan3::sam_flag::on_reverse_strand (bit `0x10`) indicates whether the read sequence has been reverse complemented
52-
* and the quality string is reversed. When bit seqan3::sam_flag::unmapped (`0x4`) is unset, this
53-
* corresponds to the strand to which the segment has been mapped: seqan3::sam_flag::on_reverse_strand (bit `0x10`)
54-
* unset indicates the forward strand, while set indicates the reverse strand. When seqan3::sam_flag::unmapped (`0x4`)
51+
* * bio::map_io::sam_flag::on_reverse_strand (bit `0x10`) indicates whether the read sequence has been reverse complemented
52+
* and the quality string is reversed. When bit bio::map_io::sam_flag::unmapped (`0x4`) is unset, this
53+
* corresponds to the strand to which the segment has been mapped: bio::map_io::sam_flag::on_reverse_strand (bit `0x10`)
54+
* unset indicates the forward strand, while set indicates the reverse strand. When bio::map_io::sam_flag::unmapped (`0x4`)
5555
* is set, this indicates whether the unmapped read is stored in its original orientation as it came off the
5656
* sequencing machine.
57-
* * seqan3::sam_flag::first_in_pair and seqan3::sam_flag::second_in_pair (bits `0x40` and `0x80`) reflect the read
58-
* ordering within each template inherent in the sequencing technology used. If seqan3::sam_flag::first_in_pair and
59-
* seqan3::sam_flag::second_in_pair (`0x40` and `0x80`) are both set, the read is part of a linear template, but it
57+
* * bio::map_io::sam_flag::first_in_pair and bio::map_io::sam_flag::second_in_pair (bits `0x40` and `0x80`) reflect the read
58+
* ordering within each template inherent in the sequencing technology used. If bio::map_io::sam_flag::first_in_pair and
59+
* bio::map_io::sam_flag::second_in_pair (`0x40` and `0x80`) are both set, the read is part of a linear template, but it
6060
* is neither the first nor the last read. If both are unset, the index of the read in the template is unknown.
6161
* This may happen for a non-linear template or when this information is lost during data processing.
62-
* * If seqan3::sam_flag::paired (bit `0x1`) is unset, no assumptions can be made about seqan3::sam_flag::proper_pair,
63-
* seqan3::sam_flag::mate_unmapped, seqan3::sam_flag::mate_on_reverse_strand, seqan3::sam_flag::first_in_pair and
64-
* seqan3::sam_flag::second_in_pair (bits `0x2`, `0x8`, `0x20`, `0x40` and `0x80`).
62+
* * If bio::map_io::sam_flag::paired (bit `0x1`) is unset, no assumptions can be made about bio::map_io::sam_flag::proper_pair,
63+
* bio::map_io::sam_flag::mate_unmapped, bio::map_io::sam_flag::mate_on_reverse_strand, bio::map_io::sam_flag::first_in_pair and
64+
* bio::map_io::sam_flag::second_in_pair (bits `0x2`, `0x8`, `0x20`, `0x40` and `0x80`).
6565
*
6666
* \sa https://broadinstitute.github.io/picard/explain-flags.html
67-
*
68-
* \remark For a complete overview, take a look at \ref io_sam_file
6967
*/
7068
enum class sam_flag : uint16_t
7169
{
@@ -76,27 +74,27 @@ enum class sam_flag : uint16_t
7674
mate_unmapped = 0x8, //!< The mate of this read is not mapped to a reference (unaligned).
7775
on_reverse_strand = 0x10, //!< The read sequence has been reverse complemented before being mapped (aligned).
7876
mate_on_reverse_strand = 0x20, //!< The mate sequence has been reverse complemented before being mapped (aligned).
79-
first_in_pair = 0x40, //!< Indicates the ordering (see details in the seqan3::sam_flag description).
80-
second_in_pair = 0x80, //!< Indicates the ordering (see details in the seqan3::sam_flag description).
77+
first_in_pair = 0x40, //!< Indicates the ordering (see details in the bio::map_io::sam_flag description).
78+
second_in_pair = 0x80, //!< Indicates the ordering (see details in the bio::map_io::sam_flag description).
8179
secondary_alignment = 0x100, //!< This read alignment is an alternative (possibly suboptimal) to the primary.
8280
failed_filter = 0x200, //!< The read alignment failed a filter, e.g. quality controls.
8381
duplicate = 0x400, //!< The read is marked as a PCR duplicate or optical duplicate.
8482
supplementary_alignment = 0x800 //!< This sequence is part of a split alignment and is not the primary alignment.
8583
};
8684

8785
//!\cond DEV
88-
//!\brief Enables bitwise operations for seqan3::sam_flags.
89-
//!\ingroup io_sam_file
90-
//!\sa seqan3::enum_bitwise_operators enables combining enum values.
86+
/*!\brief Enables bitwise operations for bio::map_io::sam_flag.
87+
* \ingroup map_io
88+
* \sa seqan3::enum_bitwise_operators enables combining enum values.
89+
*/
9190
template <>
9291
constexpr bool add_enum_bitwise_operators<sam_flag> = true;
9392
//!\endcond
9493

95-
/*!\brief Overload for the seqan3::sam_flags.
94+
/*!\brief seqan3::debug_stream overload for the bio::map_io::sam_flag.
9695
* \tparam char_t The char type of the debug_stream.
9796
* \param stream The seqan3::debug_stream.
9897
* \param flag The flag to print.
99-
* \relates seqan3::debug_stream_type
10098
*/
10199
template <typename char_t>
102100
inline seqan3::debug_stream_type<char_t> & operator<<(seqan3::debug_stream_type<char_t> & stream, sam_flag const flag)

0 commit comments

Comments
 (0)