-
Notifications
You must be signed in to change notification settings - Fork 7
/
vg.proto
373 lines (324 loc) · 15.9 KB
/
vg.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
syntax = "proto3";
import "google/protobuf/struct.proto";
package vg;
// *Graphs* are collections of nodes and edges.
// They can represent subgraphs of larger graphs
// or be wholly self-sufficient.
// Protobuf's default limit of 67108864 bytes (64 MiB) per message means we
// typically keep each Graph small, generating large graphs as collections of
// smaller subgraphs.
//
message Graph {
  repeated Node node = 1; // The `Node`s that make up the graph.
  repeated Edge edge = 2; // The `Edge`s that connect the `Node`s in the graph.
  repeated Path path = 3; // A set of named `Path`s that visit sequences of oriented `Node`s.
}
// *Nodes* store sequence data.
message Node {
  string sequence = 1; // Sequence of DNA bases represented by the Node.
  string name = 2; // A name provides an identifier.
  int64 id = 3; // Each Node has a unique positive nonzero ID within its Graph.
}
// *Edges* describe linkages between nodes. They are bidirected, connecting the
// end (default) or start of the "from" node to the start (default) or end of
// the "to" node. With both flags left false (the proto3 default), an Edge
// therefore connects the end of "from" to the start of "to".
//
message Edge {
  int64 from = 1; // ID of upstream node.
  int64 to = 2; // ID of downstream node.
  bool from_start = 3; // If true, the edge leaves from the 5' (start) of the "from" node rather than its end.
  bool to_end = 4; // If true, the edge goes to the 3' (end) of the "to" node rather than its start.
  int32 overlap = 5; // Length of overlap between the connected `Node`s.
}
// Edits describe how to generate a new string from elements
// in the graph. To determine the new string, just walk the series of edits,
// stepping from_length distance in the basis node, and to_length in the
// novel element, replacing from_length in the basis node with the sequence.
//
// There are several types of Edit:
// - *matches*: from_length == to_length; sequence is empty
// - *snps*: from_length == to_length; sequence = alt
// - *deletions*: to_length == 0 && from_length > to_length; sequence is empty
// - *insertions*: from_length < to_length; sequence = alt
//
message Edit {
  int32 from_length = 1; // Length in the target/ref sequence that is removed.
  int32 to_length = 2; // Length in read/alt of the sequence it is replaced with.
  string sequence = 3; // The replacement sequence, if different from the original sequence; empty for matches and deletions.
}
// A Mapping defines the relationship between a node in the system and another
// entity. An empty edit list implies a complete match; however, it is preferred
// to specify the full edit structure, as it is more complex to handle special
// cases.
//
// NOTE(review): field numbers 3 and 4 are not used here. If they belonged to
// fields that were removed, they should be declared `reserved` - confirm
// against the schema history.
//
message Mapping {
  Position position = 1; // The position at which the first Edit, if any, in the Mapping starts. Inclusive.
  repeated Edit edit = 2; // The series of `Edit`s that transform the region starting at `position` into the region in the read/alt.
  int64 rank = 5; // The 1-based rank of the mapping in its containing path.
}
// A position in the graph is a node, direction, and offset.
// The node is stored by ID, and the offset is 0-based and counts from the
// start of the node in the specified orientation.
// The direction specifies which orientation of the node we are considering,
// the forward (as stored) or reverse complement.
//
// Example:
//
//     seq+        G A T T A C A
//     offset+  → 0 1 2 3 4 5 6 7
//
//     seq-        C T A A T G T
//     offset-  → 0 1 2 3 4 5 6 7
//
// Or both at once:
//
//     offset-    7 6 5 4 3 2 1 0 ←
//     seq+        G A T T A C A
//     offset+  → 0 1 2 3 4 5 6 7
//
// NOTE(review): the seq- row lists the complement bases in the same
// left-to-right order as seq+, whereas the reverse complement of GATTACA read
// from offset- 0 would be TGTAATC - confirm which reading the second diagram
// intends.
//
// A Position can also, with the `name` and `offset` fields, be used to
// represent a distance along a named `Path`.
// TODO: Is this an appropriate hack? Or should we add a new message?
//
// NOTE(review): field number 3 is not used here. If it belonged to a removed
// field it should be declared `reserved` - confirm against the schema history.
message Position {
  int64 node_id = 1; // The Node on which the Position is.
  int64 offset = 2; // The offset into that node's sequence at which the Position occurs.
  bool is_reverse = 4; // True if we obtain the original sequence of the path by reverse complementing the mappings.
  string name = 5; // The name of the path, if the Position is used to represent a position against a reference path.
}
// Paths are walks through nodes defined by a series of `Edit`s.
// They can be used to represent:
// - haplotypes
// - mappings of reads, or alignments, by including edits
// - relationships between nodes
// - annotations from other data sources, such as:
//   genes, exons, motifs, transcripts, peaks
//
message Path {
  string name = 1; // The name of the path. Path names starting with underscore (_) are reserved for internal VG use.
  repeated Mapping mapping = 2; // The `Mapping`s which describe the order and orientation in which the Path visits `Node`s.
  bool is_circular = 3; // Set to true if the path is circular.
  int64 length = 4; // Optional length annotation for the Path.
}
// Alignments link query strings, such as other genomes or reads, to Paths.
//
// NOTE(review): field numbers 13 and 14 are not used and not reserved, unlike
// 8 and 33-34. If they belonged to removed fields they should be declared
// `reserved` too - confirm against the schema history.
//
message Alignment {
  // The sequence that has been aligned.
  string sequence = 1;
  // The Path that the sequence follows in the graph it has been aligned to,
  // containing the `Edit`s that modify the graph to produce the sequence.
  Path path = 2;
  // The name of the sequence that has been aligned. Similar to read name in BAM.
  string name = 3;
  // The quality scores for the sequence, as values on a 0-255 scale.
  bytes quality = 4;
  // The mapping quality score for the alignment, in Phreds.
  int32 mapping_quality = 5;
  // The score for the alignment, in points.
  int32 score = 6;
  // The offset in the query at which this Alignment occurs.
  int32 query_position = 7;
  // Old field 8 has been removed; the number is reserved so it is not reused.
  reserved 8;
  // The name of the sample that produced the aligned read.
  string sample_name = 9;
  // The name of the read group to which the aligned read belongs.
  string read_group = 10;
  // The previous Alignment in the fragment. Contains just enough information
  // to locate the full Alignment; e.g. contains an Alignment with only a name,
  // or only a graph mapping position.
  Alignment fragment_prev = 11;
  // Similarly, the next Alignment in the fragment.
  Alignment fragment_next = 12;
  // Flag marking the Alignment as secondary. All but one maximal-scoring
  // alignment of a given read in a GAM file must be secondary.
  bool is_secondary = 15;
  // Portion of aligned bases that are perfect matches, or 0 if no bases are
  // aligned.
  double identity = 16;
  // An estimate of the length of the fragment, if this Alignment is paired.
  // NOTE(review): the field type is `repeated Path`, not a number - confirm
  // how the length estimate is encoded in these Paths.
  repeated Path fragment = 17;
  // The loci that this alignment supports.
  // TODO: get rid of this, we have annotations in our data model again.
  repeated Locus locus = 18;
  // Position of the alignment in reference paths embedded in graph. Each
  // position has a path name, and the Alignment's minimum position along the
  // path as an offset.
  repeated Position refpos = 19;

  // SAMTools-style flags.
  // NOTE(review): presumably these mirror the like-named SAM FLAG bits; the
  // exact semantics are not stated in this file - confirm before relying on
  // them.
  bool read_paired = 20;
  bool read_mapped = 21;
  bool mate_unmapped = 22;
  bool read_on_reverse_strand = 23;
  bool mate_on_reverse_strand = 24;
  bool soft_clipped = 25;
  bool discordant_insert_size = 26;

  // The fraction of bases in the alignment that are covered by MEMs with <=1
  // total hits in the graph.
  double uniqueness = 27;
  // Correctness metric: 1 = perfectly aligned to truth, 0 = not overlapping
  // true alignment.
  double correct = 28;
  // The ordered list of scores of secondary mappings.
  repeated int32 secondary_score = 29;
  // Score under the given fragment model, assume higher is better.
  double fragment_score = 30;
  // NOTE(review): undocumented in the original; presumably set when the mate
  // was mapped to a subgraph disjoint from this read's - confirm.
  bool mate_mapped_to_disjoint_subgraph = 31;
  // The fragment length distribution under which a paired-end alignment was
  // aligned.
  string fragment_length_distribution = 32;
  // Haplotype-scoring-related fields migrated to the annotation system.
  reserved 33 to 34;
  // The time this alignment took (units are not stated here - TODO confirm).
  double time_used = 35;
  // A path/offset/orientation pair specifying the distance to the correct
  // alignment.
  Position to_correct = 36;
  // This can be set to true to annotate the Alignment as having been mapped
  // correctly.
  bool correctly_mapped = 37;
  // Annotations carried along with the Alignment.
  google.protobuf.Struct annotation = 100;
}
// A subgraph of the unrolled Graph in which each non-branching path is
// associated with an alignment of part of the read and part of the graph, such
// that any path through the MultipathAlignment indicates a valid alignment of
// a read to the graph.
message MultipathAlignment {
  // NOTE(review): the five fields below are undocumented in the original;
  // presumably they carry the same meaning as the like-named fields of
  // `Alignment` - confirm.
  string sequence = 1;
  bytes quality = 2;
  string name = 3;
  string sample_name = 4;
  string read_group = 5;
  // Non-branching paths of the multipath alignment, each containing an
  // alignment of part of the sequence to a Graph.
  // IMPORTANT: downstream applications will assume these are stored in
  // topological order.
  repeated Subpath subpath = 6;
  // -10 * log_10(probability of mismapping)
  int32 mapping_quality = 7;
  // Optional: indices of Subpaths that align the beginning of the read (i.e.
  // source nodes).
  repeated uint32 start = 8;
  // NOTE(review): undocumented in the original; presumably the name of the
  // other read of the pair - confirm.
  string paired_read_name = 9;
  // Annotations carried along with the MultipathAlignment.
  google.protobuf.Struct annotation = 100;
}
// A non-branching path of a MultipathAlignment.
message Subpath {
  // Describes the node sequence and the edits to the graph sequences.
  Path path = 1;
  // The indices of subpaths in the multipath alignment that are to the right
  // of this path, where right is in the direction of the end of the read
  // sequence.
  repeated uint32 next = 2;
  // Score of this subpath's alignment.
  int32 score = 3;
  // Connections to other subpaths that are not necessarily contiguous in the
  // graph.
  repeated Connection connection = 4;
}
// An edge in a MultipathAlignment between Subpaths that may not be contiguous
// in the graph.
message Connection {
  // The index of the Subpath that this connection points to.
  uint32 next = 1;
  // The score of this connection.
  int32 score = 2;
}
// Used to serialize kmer matches.
message KmerMatch {
  // NOTE(review): undocumented in the original; presumably the kmer's
  // sequence - confirm.
  string sequence = 1;
  // NOTE(review): undocumented in the original; presumably the ID of the node
  // the kmer matches - confirm.
  int64 node_id = 2;
  // Position of the kmer relative to its node; see `backward` for the
  // direction it is counted in. Declared `sint32`, so negative values are
  // encodable.
  sint32 position = 3;
  // If true, this kmer is backwards relative to its node, and position counts
  // from the end of the node.
  bool backward = 4;
}
// Summarizes reads that map to a single position in the graph.
// This structure is pretty much identical to a line in Samtools pileup format.
// If `qualities` is set, it must have size = `num_bases`.
message BasePileup {
  // NOTE(review): stored as an int32; presumably the character code of the
  // reference base - confirm.
  int32 ref_base = 1;
  // Number of bases piled up here; `qualities`, when set, has this many bytes.
  int32 num_bases = 2;
  // NOTE(review): presumably the Samtools-pileup-style bases string - confirm
  // the exact encoding.
  string bases = 3;
  // Quality values; optional, see the size constraint above.
  bytes qualities = 4;
}
// Collect pileup records by node. Saves some space and hashing over storing
// them individually, assuming they are not too sparse and the average node
// length is more than a couple of bases.
// The ith BasePileup in the array corresponds to the position at offset i.
message NodePileup {
  // ID of the node these pileups belong to.
  int64 node_id = 1;
  // One record per offset into the node; index i corresponds to offset i.
  repeated BasePileup base_pileup = 2;
}
// Keep a pileup-like record for reads that span edges.
message EdgePileup {
  // The edge this record describes.
  Edge edge = 1;
  int32 num_reads = 2; // total reads mapped
  int32 num_forward_reads = 3; // number of reads mapped on forward strand
  // NOTE(review): undocumented in the original; presumably quality values for
  // the reads counted above - confirm layout and length.
  bytes qualities = 4;
}
// Bundle up Node and Edge pileups.
message Pileup {
  // Per-node pileup records.
  repeated NodePileup node_pileups = 1;
  // Per-edge pileup records.
  repeated EdgePileup edge_pileups = 2;
}
// Enumeration of the classifications of snarls.
//
// NOTE(review): the value names are not prefixed with the enum name, so they
// live directly in the enclosing `vg` scope and can collide with other enums'
// values. Renaming them would break generated code, so they are left as is.
enum SnarlType {
  // Zero value, and therefore the proto3 default when `type` is not set.
  UNCLASSIFIED = 0;
  ULTRABUBBLE = 1;
  UNARY = 2;
}
// Describes a subgraph that is connected to the rest of the graph by two
// nodes.
message Snarl {
  // What type of snarl is this?
  SnarlType type = 1;
  // Visits that connect the Snarl to the rest of the graph.
  Visit start = 2; // points *INTO* the snarl
  Visit end = 3; // points *OUT OF* the snarl
  // If this Snarl is nested in another, this field should be filled in with a
  // Snarl that has the start and end visits filled in (other information is
  // optional/extraneous).
  Snarl parent = 4;
  // Allows snarls to be named, e.g. by the hash of the VCF variant they come
  // from.
  string name = 5;
  // Indicate whether there is a reversing path contained in the Snarl from
  // either the start to itself or the end to itself.
  bool start_self_reachable = 6;
  bool end_self_reachable = 7;
  // Indicate whether the start of the Snarl is connected through to the end.
  bool start_end_reachable = 8;
  // Indicate whether the snarl's net graph is free of directed cycles.
  bool directed_acyclic_net_graph = 9;
}
// Describes a step of a walk through a Snarl, either on a node or through a
// child Snarl.
message Visit {
  // The node ID or snarl of this step (only one should be given).
  // These are two plain fields, not a `oneof`, so the schema itself does not
  // enforce the exclusivity.
  int64 node_id = 1;
  Snarl snarl = 2; // only needs to contain the start and end Visits
  string name = 4; // The segment name of this step.
  // Indicates:
  //   if node_id is specified: reverse complement of the node
  //   if snarl is specified: traversal of a child snarl entering backwards
  //                          through end and leaving backwards through start
  bool backward = 3;
}
// Describes a walk through a Snarl where each step is given as either a node
// or a child Snarl (leaving the walk through the child Snarl to another
// SnarlTraversal).
message SnarlTraversal {
  // Steps of the walk through a Snarl, including the start and end nodes. If
  // the traversal includes a Visit that represents a Snarl, both the node
  // entering the Snarl and the node leaving the Snarl should be included in
  // the traversal.
  repeated Visit visit = 1;
  // The name of the traversal can be used for a variant allele id (e.g.
  // <parentSnarlHash>_0, <parentSnarlHash>_1, ...) or for some other arbitrary
  // annotation, unique or non-unique, e.g. deleterious, gain_of_function,
  // etc. (though these will be lost in any indices).
  string name = 2;
}
// Describes a genetic locus with multiple possible alleles, a genotype, and
// observational support.
message Locus {
  // A locus may have an identifying name.
  string name = 1;
  // These are all the alleles at the locus, not just the called ones.
  // Note that a primary reference allele may or may not appear.
  repeated Path allele = 2;
  // These supports are per-allele, matching the alleles above (entry i
  // presumably describes `allele[i]` - confirm).
  repeated Support support = 3;
  // Sorted by likelihood or posterior;
  // the first one is the "call".
  repeated Genotype genotype = 4;
  // We also have a Support for the locus overall, because reads may have
  // supported multiple alleles and we want to know how many total there were.
  Support overall_support = 5;
  // We track the likelihood of each allele individually, in addition to
  // genotype likelihoods. Stores the likelihood natural logged.
  repeated double allele_log_likelihood = 6;
}
// Describes a genotype at a particular locus.
message Genotype {
  // These refer to the offsets of the alleles in the Locus object.
  repeated int32 allele = 1;
  // NOTE(review): undocumented in the original; presumably true when the
  // order of `allele` reflects phasing - confirm.
  bool is_phased = 2;
  // NOTE(review): undocumented in the original; presumably the non-logged
  // counterpart of `log_likelihood` - confirm.
  double likelihood = 3;
  double log_likelihood = 4; // Likelihood natural logged.
  double log_prior = 5; // Prior natural logged.
  double log_posterior = 6; // Posterior natural logged (unnormalized).
}
// Aggregates information about the reads supporting an allele.
message Support {
  // The overall quality of all the support, as
  // -10 * log10(P(all support is wrong)).
  double quality = 1;
  // The number of supporting reads on the forward strand (which may be
  // fractional).
  double forward = 2;
  // The number of supporting reads on the reverse strand (which may be
  // fractional).
  double reverse = 3;
  // TODO: what is this? (Meaning is not documented anywhere in this file.)
  double left = 4;
  // TODO: what is this? (Meaning is not documented anywhere in this file.)
  double right = 5;
}
// Support pinned to a location, which can be either a node or an edge.
message LocationSupport {
  // The support.
  Support support = 1;
  // The location. Being a `oneof`, at most one of `edge` and `node_id` is set
  // at a time; setting one clears the other.
  oneof oneof_location {
    Edge edge = 2;
    int64 node_id = 3;
  }
}
// Translations map from one graph to another.
// A collection of these provides a covering mapping between a "from" graph
// and a "to" graph.
// If each "from" path through the base graph corresponds to a "to" path in an
// updated graph, then we can use these translations to project positions,
// mappings, and paths in the new graph into the old one using the Translator
// interface.
message Translation {
  // Path through the base ("from") graph.
  Path from = 1;
  // Corresponding path in the updated ("to") graph.
  Path to = 2;
}