@@ -40,7 +40,15 @@ namespace bio::map_io
40
40
* \ingroup map_io
41
41
* \details
42
42
*
43
- * TODO
43
+ * Each header line begins with the character `@` followed by one of the two-letter header record type codes
44
+ * defined in this section. In the header, each line is tab-delimited and, apart from `@CO` lines, each data field
45
+ * follows a format `TAG:VALUE` where TAG is a two-character string that defines the format and content of
46
+ * VALUE. Thus header lines match `/^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/` or are comment lines starting
47
+ * with `@CO` followed by a tab and any character sequence.
48
+ * Within each (non-`@CO`) header line, no field tag may appear more than once and the order in which the fields
49
+ * appear is not significant.
50
+ *
51
+ * \sa https://samtools.github.io/hts-specs/SAMv1.pdf
44
52
*/
45
53
class header
46
54
{
@@ -57,7 +65,6 @@ class header
57
65
header & operator =(header &&) = default ; // !< Defaulted.
58
66
59
67
/* !\brief Construct from a range of reference ids.
60
- * \param[in] The plain text header.
61
68
* \param[in] ref_ids The range over reference ids to redirect the pointer at.
62
69
*/
63
70
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
@@ -95,6 +102,7 @@ class header
95
102
// !\brief The reference sequence names.
96
103
std::vector<std::string_view> reference_names;
97
104
105
+ // !\brief Additional information to the reference sequence (same ordering as `reference_names`).
98
106
std::vector<std::tuple<int32_t , std::string>> reference_names_info{};
99
107
100
108
// !\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
@@ -103,6 +111,7 @@ class header
103
111
// !\brief Whether reference sequence names were given to the header on construction.
104
112
bool reference_names_given_on_construction{false };
105
113
114
+ // !\brief Print a B.I.O warning message with current line number in diagnostic.
106
115
/* [[noreturn]] compiler says this returns something...? */ void warning (auto const &... messages) const
107
116
{
108
117
// if (print_warnings)
@@ -114,22 +123,22 @@ class header
114
123
// }
115
124
}
116
125
public:
117
- /* !\name [@ HD] File-level meta data
126
+ /* !\name [HD] File-level meta data
118
127
* \brief You can directly edit these member variables.
119
128
* \{
120
129
*/
121
- std::string format_version{}; // !< [@ HD VN] The file format version. Note: this is overwritten by our formats on output.
122
- std::string sorting{}; // !< [@ HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
123
- std::string grouping{}; // !< [@ HD GO] The grouping of the file. SAM: [none, query, reference].
124
- std::string subsorting{}; // !< [@ HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate](:[A-Za-z0-9_-]+)+.
130
+ std::string format_version{}; // !< [HD VN] The file format version. Note: this is overwritten by our formats on output.
131
+ std::string sorting{}; // !< [HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
132
+ std::string grouping{}; // !< [HD GO] The grouping of the file. SAM: [none, query, reference].
133
+ std::string subsorting{}; // !< [HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate]` (:[A-Za-z0-9_-]+)+` .
125
134
// !\}
126
135
127
- /* !\name [@ SQ] Reference sequence dictionary
136
+ /* !\name [SQ] Reference sequence dictionary
128
137
* \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers.
129
138
* \{
130
139
*/
131
140
132
- /* !\brief [@ SQ SN] Reference sequence names
141
+ /* !\brief [SQ SN] Reference sequence names
133
142
*
134
143
* \details
135
144
*
@@ -139,11 +148,11 @@ class header
139
148
* 1) Reference id information is provided on construction. In this case, no copy is made but this function
140
149
* gives you a reference to the provided range. When reading the header or the records, their reference
141
150
* information will be checked against the given input.
142
- * 2) No reference information is provided on construction but the \ @SQ tags are present in the header.
151
+ * 2) No reference information is provided on construction but the ` @SQ` tags are present in the header.
143
152
* In this case, the reference id information is extracted from the header and this member function provides
144
153
* access to them. When reading the records, their reference id information will be checked against the header
145
154
* information.
146
- * 3) No reference information is provided on construction an no \ @SQ tags are present in the header.
155
+ * 3) No reference information is provided on construction and no ` @SQ` tags are present in the header.
147
156
* In this case, the reference information is parsed from the records field::ref_id and stored in the header.
148
157
* This member function then provides access to the unique list of reference names encountered in the records.
149
158
*/
@@ -152,11 +161,11 @@ class header
152
161
return reference_names;
153
162
}
154
163
155
- /* !\brief [@ SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
164
+ /* !\brief [SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
156
165
*
157
166
* \details
158
167
*
159
- * The reference information store the length (\ @LN tag) and
168
+ * The reference information stores the length (` @LN` tag) and
160
169
* additional information of each reference sequence in the file. The record
161
170
* must then store only the index of the reference.
162
171
* The name and length information are required if the header is provided
@@ -166,17 +175,17 @@ class header
166
175
*
167
176
* The additional information (2nd tuple entry) must model
168
177
* the following formatting rules: The information is given in a tab separated
169
- * TAG:VALUE format, where TAG must be one of [AH, AN, AS, m5, SP, UR].
178
+ * ` TAG:VALUE` format, where TAG must be one of [AH, AN, AS, M5, SP, UR].
170
179
* The following information and rules apply for each tag (taken from the SAM specs):
171
180
*
172
181
* * **AH:** Indicates that this sequence is an alternate locus. The value is the locus in the primary assembly for
173
- * which this sequence is an alternative, in the format ' chr:start-end', ' chr' (if known), or '*' (if
174
- * unknown), where ' chr' is a sequence in the primary assembly. Must not be present on sequences in the
182
+ * which this sequence is an alternative, in the format ` chr:start-end`, ` chr` (if known), or `*` (if
183
+ * unknown), where ` chr` is a sequence in the primary assembly. Must not be present on sequences in the
175
184
* primary assembly.
176
185
* * **AN:** Alternative reference sequence names. A comma-separated list of alternative names that tools may use
177
186
* when referring to this reference sequence. These alternative names are not used elsewhere within the
178
187
* SAM file; in particular, they must not appear in alignment records’ RNAME or RNEXT fields. regular
179
- * expression : name (, name )* where name is [0-9A-Za-z][0-9A-Za-z*+.@ \|-]*
188
+ * expression : ` name (, name )*` where name is ` [0-9A-Za-z][0-9A-Za-z*+.@ \|-]*`.
180
189
* * **AS:** Genome assembly identifier.
181
190
* * **M5:** MD5 checksum of the sequence. See Section 1.3.1
182
191
* * **SP:** Species.
@@ -204,7 +213,7 @@ class header
204
213
}
205
214
// !\}
206
215
207
- /* !\name [@ RG] Read groups
216
+ /* !\name [RG] Read groups
208
217
* \brief You can directly edit these member variables.
209
218
* \{
210
219
*/
@@ -215,7 +224,7 @@ class header
215
224
* The read group list stores the group id and
216
225
* additional information of each read group in the file. The record
217
226
* may store a RG tag information referencing one of the stored id's.
218
- * The id information is required if the @RG header line is provided.
227
+ * The id information is required if the \ @RG header line is provided.
219
228
*
220
229
* The additional information (2nd tuple entry) for the SAM format must follow
221
230
* the following formatting rules: The information is given in a tab separated
@@ -225,13 +234,13 @@ class header
225
234
* * **BC:** Barcode sequence identifying the sample or library. This value is the expected barcode bases as read by
226
235
* the sequencing machine in the absence of errors. If there are several barcodes for the sample/library
227
236
* (e.g., one on each end of the template), the recommended implementation concatenates all the barcodes
228
- * separating them with hyphens ('-' ).
237
+ * separating them with hyphens (`-` ).
229
238
* * **CN:** Name of sequencing center producing the read.
230
239
* * **DS:** Description. UTF-8 encoding may be used.
231
240
* * **DT:** Date the run was produced (ISO8601 date or date/time).
232
241
* * **FO:** Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each
233
242
* read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other
234
- * characters. Format : /\*\|[ACMGRSVTWYHKDBN]+/
243
+ * characters. Format : ` /\*\|[ACMGRSVTWYHKDBN]+/`
235
244
* * **KS:** The array of nucleotide bases that correspond to the key sequence of each read.
236
245
* * **LB:** Library.
237
246
* * **PG:** Programs used for processing the read group.
@@ -245,7 +254,7 @@ class header
245
254
std::vector<std::pair<std::string, std::string>> read_groups{};
246
255
// !\}
247
256
248
- /* !\name [@ PG] Programm information
257
+ /* !\name [PG] Program information
249
258
* \brief You can directly edit these member variables.
250
259
* \{
251
260
*/
@@ -263,7 +272,7 @@ class header
263
272
std::vector<program_info_t > program_infos{}; // !< The list of program information.
264
273
// !\}
265
274
266
- /* !\name [@ CO] Comments
275
+ /* !\name [CO] Comments
267
276
* \brief You can directly edit these member variables.
268
277
* \{
269
278
*/
@@ -272,17 +281,16 @@ class header
272
281
};
273
282
274
283
/* !\brief Reads the SAM header.
275
- * \tparam stream_view_type The type of the stream as a view.
276
- * \param[in, out] stream_view The stream view to iterate over.
284
+ * \param[in] header_string The full header as a std::string_view.
277
285
*
278
- * \throws seqan3 ::format_error if any unexpected character or format is encountered.
286
+ * \throws bio::map_io ::format_error if any unexpected character or format is encountered.
279
287
*
280
288
* \details
281
289
*
282
290
* Reading the header format is done according to the official
283
291
* [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf).
284
292
*
285
- * The function throws a seqan3 ::format_error if any unknown tag was encountered. It will also fail if the format is
293
+ * The function throws a bio::map_io ::format_error if any unknown tag was encountered. It will also fail if the format is
286
294
* not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
287
295
* error.
288
296
*/
0 commit comments