Skip to content

Commit 99d09c0

Browse files
committed
working prototype
1 parent d648a5f commit 99d09c0

File tree

3 files changed

+57
-35
lines changed

3 files changed

+57
-35
lines changed

include/kreeq.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ class DBG : public Kmap<DBG, UserInputKreeq, uint64_t, DBGkmer, DBGkmer32> { //
229229

230230
bool DBGtoVariants(InSegment *inSegment);
231231

232-
std::pair<bool,std::deque<DBGpath>> searchVariants(std::pair<const uint64_t,DBGkmer32> source, std::array<uint16_t, 2> mapRange, phmap::parallel_flat_hash_map<uint64_t,bool> &targetsMap, ParallelMap32* localGraphCache);
232+
std::pair<bool,std::deque<DBGpath>> searchVariants(std::pair<const uint64_t,DBGkmer32> source, std::array<uint16_t, 2> mapRange, const std::deque<const uint64_t> &targetsQueue, const phmap::parallel_flat_hash_map<uint64_t,bool> &targetsMap, ParallelMap32* localGraphCache);
233233

234234
bool variantsToGFA(InSegment *inSegment, Log &threadLog);
235235

src/generate-tests.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@ int main(void) {
3636
const std::set<std::string> excludeFile {"random4.fasta", "random4.fastq", "random5.fasta", "random5.fastq", "random6.fastq", "random7.fastq", "random8.fastq", "random9.fastq", "random10.fastq", "random11.fasta", "random11.fastq", "random12.fasta", "random12.fastq", "to_correct.fasta", "to_correct.fastq", "decompressor1.fasta", "repeat1.fasta", "repeat1.fastq"};
3737

3838
std::vector<std::pair<std::set<std::string>, std::vector<std::string>>> file_args = {
39-
{{"-f testFiles/random1.fasta"}, {"-r testFiles/random3.N.fastq", "-d testFiles/test1.kreeq", "-d testFiles/test2.kreeq"}},
40-
{{"-f testFiles/random4.fasta"}, {"-r testFiles/random4.fastq -k3"}},
41-
{{"-f testFiles/to_correct.fasta"}, {"-r testFiles/to_correct.fastq", "-r testFiles/to_correct.fastq -o gfa", "-r testFiles/to_correct.fastq -o vcf", "-r testFiles/to_correct.fastq -o vcf -p testFiles/random1.anomalies.bed"}}
39+
{{"random1.fasta"}, {"-r testFiles/random3.N.fastq", "-d testFiles/test1.kreeq", "-d testFiles/test2.kreeq"}},
40+
{{"random4.fasta"}, {"-r testFiles/random4.fastq -k3"}},
41+
{{"to_correct.fasta"}, {"-r testFiles/to_correct.fastq", "-r testFiles/to_correct.fastq -o gfa", "-r testFiles/to_correct.fastq -o vcf", "-r testFiles/to_correct.fastq -o vcf -p testFiles/random1.anomalies.bed"}}
4242
// {{set of test inputs}, {list of command line args to run with}}
4343
};
4444

@@ -61,7 +61,7 @@ int main(void) {
6161
if(!fstream) continue;
6262
fstream.close();
6363
for(const std::string &args : pair.second) {
64-
genTest("kreeq", "validate", file, args);
64+
genTest("kreeq", "validate", "-f testFiles/" + file, args);
6565
}
6666
}
6767
}

src/variants.cpp

+52-30
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ bool DBG::DBGtoVariants(InSegment *inSegment) {
7272
while(explored < kcount) {
7373

7474
mapRange = {0,0};
75-
std::queue<uint64_t> targetsQueue;
75+
std::deque<const uint64_t> targetsQueue;
7676
phmap::parallel_flat_hash_map<uint64_t,bool> targetsMap;
7777

7878
while (mapRange[1] < mapCount) {
@@ -86,19 +86,24 @@ bool DBG::DBGtoVariants(InSegment *inSegment) {
8686
bool isFw = false;
8787

8888
for (uint16_t pos = 0; pos < userInput.maxSpan; ++pos) { // populate targets
89-
targetsQueue.push(hash(str+pos));
90-
targetsMap[hash(str+pos)];
89+
if (pos < len-userInput.maxSpan) {
90+
key = hash(str+pos);
91+
targetsQueue.push_back(key);
92+
targetsMap[key];
93+
}
9194
}
9295

9396
for (uint64_t c = 0; c<kcount; ++c){
9497

95-
if(!visited[c]) {
96-
97-
targetsMap.erase(targetsQueue.front());
98-
targetsQueue.pop();
98+
targetsMap.erase(targetsQueue.front());
99+
targetsQueue.pop_front();
100+
if (c < len-userInput.maxSpan) {
99101
key = hash(str+c+userInput.maxSpan);
100102
targetsMap[key];
101-
targetsQueue.push(key);
103+
targetsQueue.push_back(key);
104+
}
105+
106+
if(!visited[c]) {
102107

103108
key = hash(str+c, &isFw);
104109
i = key % mapCount;
@@ -121,14 +126,11 @@ bool DBG::DBGtoVariants(InSegment *inSegment) {
121126
}else{
122127
pair = *it;
123128
}
124-
auto results = searchVariants(pair, mapRange, targetsMap, localGraphCache);
129+
auto results = searchVariants(pair, mapRange, targetsQueue, targetsMap, localGraphCache);
125130
explored += results.first;
126131
if (results.first) {
127-
for (DBGpath &path : results.second) {
128-
path.pos = c;
129-
path.type = SNV;
130-
std::cout<<+path.pos<<" "<<+path.type<<" "<<path.sequence<<std::endl;
131-
}
132+
for (DBGpath &path : results.second)
133+
path.pos = c+k;
132134

133135
if (results.second.size() != 0)
134136
variants.push_back(results.second);
@@ -161,13 +163,13 @@ bool DBG::DBGtoVariants(InSegment *inSegment) {
161163
return true;
162164
}
163165

164-
std::pair<bool,std::deque<DBGpath>> DBG::searchVariants(std::pair<const uint64_t,DBGkmer32> source, std::array<uint16_t, 2> mapRange, phmap::parallel_flat_hash_map<uint64_t,bool> &targetsMap, ParallelMap32* localGraphCache) { // dijkstra variant search
166+
std::pair<bool,std::deque<DBGpath>> DBG::searchVariants(std::pair<const uint64_t,DBGkmer32> source, std::array<uint16_t, 2> mapRange, const std::deque<const uint64_t> &targetsQueue, const phmap::parallel_flat_hash_map<uint64_t,bool> &targetsMap, ParallelMap32* localGraphCache) { // dijkstra variant search
165167

166168
bool explored = false; // true if we reached a node in the original graph
167169
std::vector<uint64_t> destinations;
168170
FibonacciHeap<std::pair<const uint64_t, DBGkmer32>*> Q; // node priority queue Q
169-
phmap::parallel_flat_hash_map<uint64_t,uint8_t> dist;
170-
phmap::parallel_flat_hash_map<uint64_t,std::pair<uint64_t,bool>> prev; // distance table
171+
phmap::parallel_flat_hash_map<uint64_t,uint8_t> dist; // distance table
172+
phmap::parallel_flat_hash_map<uint64_t,std::pair<uint64_t,bool>> prev; // path table
171173
std::deque<DBGpath> discoveredPaths;
172174

173175
dist[source.first] = 1;
@@ -193,7 +195,7 @@ std::pair<bool,std::deque<DBGpath>> DBG::searchVariants(std::pair<const uint64_t
193195
if (startNode == targetsMap.end()) { // if we connect to the original graph we are done
194196
auto nextKmer = localGraphCache->find(key); // check if the node is in the cache (already visited)
195197

196-
if (nextKmer == localGraphCache->end()) { // we cached this node before
198+
if (nextKmer == localGraphCache->end()) { // we did not cache this node before
197199
uint64_t m = key % mapCount;
198200
if (m >= mapRange[0] && m < mapRange[1]) { // the node is not cached but is available to visit now
199201
map = maps[m];
@@ -234,8 +236,10 @@ std::pair<bool,std::deque<DBGpath>> DBG::searchVariants(std::pair<const uint64_t
234236
bool found = checkNext(key, isFw ? direction : !direction);
235237
if (found) {
236238
++exploredCount;
237-
if (targetsMap.find(key) != targetsMap.end())
238-
destinations.push_back(u->first);
239+
if (key != targetsQueue.front() && targetsMap.find(key) != targetsMap.end()) {
240+
prev[key] = std::make_pair(u->first,direction);
241+
destinations.push_back(key);
242+
}
239243
}
240244
++edgeCount;
241245
}
@@ -252,8 +256,10 @@ std::pair<bool,std::deque<DBGpath>> DBG::searchVariants(std::pair<const uint64_t
252256
bool found = checkNext(key, isFw ? direction : !direction);
253257
if (found) {
254258
++exploredCount;
255-
if (targetsMap.find(key) != targetsMap.end())
256-
destinations.push_back(u->first);
259+
if (key != targetsQueue.front() && targetsMap.find(key) != targetsMap.end()) {
260+
prev[key] = std::make_pair(u->first,direction);
261+
destinations.push_back(key);
262+
}
257263
}
258264
++edgeCount;
259265
}
@@ -264,16 +270,32 @@ std::pair<bool,std::deque<DBGpath>> DBG::searchVariants(std::pair<const uint64_t
264270
if(edgeCount == exploredCount || depth == userInput.kmerDepth + 1 || destinations.size() >= 10) // everything explored/found, depth reached, or top10
265271
explored = true;
266272
}
267-
if (destinations.size() > 1) { // traverse from target to source, the first path is the reference
273+
if (destinations.size() > 0) { // traverse from target to source, the first path is the reference
268274
for (uint64_t destination : destinations) {
269275
DBGpath newPath;
270-
newPath.sequence = reverseHash(destination);
271-
while (destination != source.first) { // construct the shortest path with a stack S
272-
273-
newPath.sequence.push_back(reverseHash(destination)[k-1]);
274-
275-
dist.erase(destination);
276-
destination = prev[destination].first;
276+
std::string endSequence = reverseHash(destination);
277+
uint16_t i = 0, refLen = std::find(targetsQueue.begin(), targetsQueue.end(), destination) - targetsQueue.begin();
278+
uint64_t prevNode = prev[destination].first;
279+
280+
while (prevNode != source.first) { // walk the prev table back to the source, counting the hops in i
281+
prevNode = prev[prevNode].first;
282+
++i;
283+
}
284+
285+
if (i == refLen)
286+
newPath.type = SNV;
287+
else if (i > refLen)
288+
newPath.type = DEL;
289+
else
290+
newPath.type = INS;
291+
292+
prevNode = prev[destination].first;
293+
bool direction = prev[destination].second;
294+
while (i >= refLen) {
295+
newPath.sequence.push_back(direction ? revCom(reverseHash(prevNode))[0] : revCom(reverseHash(prevNode))[0]);
296+
prevNode = prev[prevNode].first;
297+
direction = prev[prevNode].second;
298+
--i;
277299
}
278300
discoveredPaths.push_back(newPath);
279301
}

0 commit comments

Comments
 (0)