Skip to content

Commit 02beeba

Browse files
committed
variable renamed
1 parent 8e7717f commit 02beeba

File tree

1 file changed

+144
-141
lines changed

1 file changed

+144
-141
lines changed

src/kreeq.cpp

+144-141
Original file line numberDiff line numberDiff line change
@@ -887,177 +887,180 @@ void DBG::nextKmerFromString(uint8_t *nextKmer, std::string *sequence, uint64_t
887887

888888
}
889889

890-
bool DBG::isKeyFw(uint64_t key) { // this doesn't make any sense
891-
bool isFw;
892-
std::string kmer = reverseHash(key);
893-
uint8_t thisKmer[k];
894-
for (uint8_t e = 0; e<k; ++e)
895-
thisKmer[e] = ctoi[(unsigned char)kmer[e]];
896-
hash(thisKmer, &isFw);
897-
return isFw;
898-
}
899-
900-
void DBG::DBGgraphToGFA() {
901-
890+
void DBG::collapseNodes() {
891+
902892
uint32_t idCounter = 0, seqPos = 0, edgeCounter = 0;
903893
phmap::flat_hash_map<std::string, unsigned int>& headersToIds = *GFAsubgraph.getHash1();
894+
phmap::parallel_flat_hash_map<uint64_t, std::tuple<DBGkmer32,uint32_t,bool>> residualEdges; // hash, kmer, G' node, node side
904895

905-
if (!userInput.noCollapse) {
896+
auto extend = [&,this] (std::string &seed, int8_t direction) {
906897

907-
phmap::parallel_flat_hash_map<uint64_t, std::tuple<DBGkmer32,uint32_t,bool>> residualEdges; // hash, kmer, G' node, node side
898+
uint64_t key, baseCounter = 0;
899+
bool isFw;
900+
uint8_t nextKmer[k];
901+
for (uint8_t e = 0; e<k; ++e)
902+
nextKmer[e] = ctoi[(unsigned char)seed[e]];
903+
key = hash(nextKmer, &isFw);
908904

909-
auto extend = [&,this] (std::pair<uint64_t, DBGkmer32color> node, std::string &seed, int8_t side) {
905+
std::pair<uint64_t, DBGkmer32color> node = *DBGsubgraph->find(key);
906+
907+
if ((direction ? node.second.fwCount() : node.second.bwCount()) > 1) {
908+
std::cout<<"Branching node side, cannot extend. Terminating."<<std::endl;
909+
exit(EXIT_FAILURE);
910+
}else if ((direction ? node.second.fwCount() : node.second.bwCount()) == 0){
911+
std::cout<<"Dead end, cannot extend. Terminating."<<std::endl;
912+
exit(EXIT_FAILURE);
913+
}
914+
915+
while (true) {
910916

911-
if ((side ? node.second.fwCount() : node.second.bwCount()) > 1) {
912-
std::cout<<"Branching node side, cannot extend. Terminating."<<std::endl;
913-
exit(EXIT_FAILURE);
914-
}else if ((side ? node.second.fwCount() : node.second.bwCount()) == 0){
915-
std::cout<<"Dead end, cannot extend. Terminating."<<std::endl;
916-
exit(EXIT_FAILURE);
917-
}
917+
uint8_t i = isFw ? node.second.fwEdgeIndexes()[0] : 3-node.second.bwEdgeIndexes()[0];
918918

919-
uint64_t key, baseCounter = 0;
920-
bool isFw = isKeyFw(node.first);
921-
isFw = side ? isFw : !isFw;
919+
uint8_t nextKmer[k];
920+
nextKmerFromString(nextKmer, &seed, baseCounter++, i);
921+
key = hash(nextKmer, &isFw);
922922

923-
while (true) {
924-
925-
uint8_t i = isFw ? node.second.fwEdgeIndexes()[0] : 3-node.second.bwEdgeIndexes()[0];
926-
927-
uint8_t nextKmer[k];
928-
nextKmerFromString(nextKmer, &seed, baseCounter++, i);
929-
key = hash(nextKmer, &isFw);
930-
931923
// std::cout<<seed<<std::endl;
932924
// std::cout<<+i<<std::endl;
933-
auto prevNode = node;
934-
auto got = DBGsubgraph->find(key);
935-
936-
if (got == DBGsubgraph->end()) { // we found a novel dead end
937-
auto got = residualEdges.find(key); // we couldn't find the node as it was already visited and deleted
938-
if(got != residualEdges.end()) {
939-
residualEdges[prevNode.first] = std::make_tuple(prevNode.second,idCounter,side);
940-
941-
}
942-
break; // real dead ends
925+
auto prevNode = node;
926+
auto got = DBGsubgraph->find(key);
927+
928+
if (got == DBGsubgraph->end()) { // we found a novel dead end
929+
auto got = residualEdges.find(key); // we couldn't find the node as it was already visited and deleted
930+
if(got != residualEdges.end()) {
931+
residualEdges[prevNode.first] = std::make_tuple(prevNode.second,idCounter,direction);
932+
943933
}
934+
break; // real dead ends
935+
}
944936

945-
node = *got;
937+
node = *got;
946938

947-
std::vector<uint8_t> nextFrontEdges = isFw ? node.second.fwEdgeIndexes() : node.second.bwEdgeIndexes();
948-
std::vector<uint8_t> nextBackEdges = isFw ? node.second.bwEdgeIndexes() : node.second.fwEdgeIndexes();
949-
950-
if(nextBackEdges.size() > 1) { // this node is branching back, we cannot include it
951-
residualEdges[prevNode.first] = std::make_tuple(prevNode.second,idCounter,side);
952-
break;
953-
}
954-
955-
seed.push_back(itoc[i]); // append its base
956-
DBGsubgraph->erase(key); // we can now safely erase as these nodes don't need to be stored
957-
958-
if (nextFrontEdges.size() == 0) { // we found a dead end
959-
break;
960-
} else if (nextFrontEdges.size() > 1) { // we found a potentially fw branching node, if true, nothing more to be done
961-
residualEdges[node.first] = std::make_tuple(node.second,idCounter,side); // we preserve the edge information
962-
break;
963-
}
939+
std::vector<uint8_t> nextFrontEdges = isFw ? node.second.fwEdgeIndexes() : node.second.bwEdgeIndexes();
940+
std::vector<uint8_t> nextBackEdges = isFw ? node.second.bwEdgeIndexes() : node.second.fwEdgeIndexes();
941+
942+
if(nextBackEdges.size() > 1) { // this node is branching back, we cannot include it
943+
residualEdges[prevNode.first] = std::make_tuple(prevNode.second,idCounter,direction);
944+
break;
964945
}
965-
};
966-
967-
while (DBGsubgraph->size() != 0) { // until all nodes have been merged or outputted
968946

969-
auto pair = DBGsubgraph->begin(); // pick a random node
970-
std::string frontSequence = reverseHash(pair->first); // we grow the sequence in both directions
971-
std::string backSequence = revCom(reverseHash(pair->first));
972-
973-
uint8_t edgeCounts[2] = {pair->second.bwCount(), pair->second.fwCount()};
947+
seed.push_back(itoc[i]); // append its base
948+
DBGsubgraph->erase(key); // we can now safely erase as these nodes don't need to be stored
974949

975-
if (edgeCounts[0] == 1 || edgeCounts[1] == 1) { // we are at a branch, otherwise we are in the middle, nothing can be merged safely
976-
977-
for (int8_t side = 1; side >= 0; --side) {
950+
if (nextFrontEdges.size() == 0) { // we found a dead end
951+
break;
952+
} else if (nextFrontEdges.size() > 1) { // we found a potentially fw branching node, if true, nothing more to be done
953+
residualEdges[node.first] = std::make_tuple(node.second,idCounter,direction); // we preserve the edge information
954+
break;
955+
}
956+
}
957+
};
958+
959+
while (DBGsubgraph->size() != 0) { // until all nodes have been merged or outputted
960+
961+
auto pair = DBGsubgraph->begin(); // pick a random node
962+
std::string frontSequence = reverseHash(pair->first); // we grow the sequence in both directions
963+
std::string backSequence = revCom(reverseHash(pair->first));
964+
965+
uint8_t edgeCounts[2] = {pair->second.bwCount(), pair->second.fwCount()};
966+
967+
if (edgeCounts[0] == 1 || edgeCounts[1] == 1) { // we are in the middle or at a partial branch, we can start merging
968+
969+
for (int8_t direction = 1; direction >= 0; --direction) { // we potentially extend in both directions
970+
971+
if (edgeCounts[direction] == 1) { // we can extend if we are at a branch and this the non branching side
978972

979-
if (edgeCounts[side] == 1) { // we can extend if we are at a branch and this the non branching side
980-
981-
extend(*pair, (side ? frontSequence : backSequence), side);
973+
extend((direction ? frontSequence : backSequence), direction);
982974
// std::cout<<"sequence: "<<(!side ? frontSequence : backSequence)<<std::endl;
983-
984-
985-
}else if (edgeCounts[side] > 1){ // if branch, we keep track of neighbours, otherwise it's a dead end and we pick another node
986-
residualEdges[pair->first] = std::make_tuple(pair->second,idCounter,side); // we preserve the edge
987-
}
975+
976+
977+
}else if (edgeCounts[direction] > 1){ // if branch, we keep track of neighbours, otherwise it's a dead end and we pick another node
978+
residualEdges[pair->first] = std::make_tuple(pair->second,idCounter,direction); // we preserve the edge
988979
}
989-
// std::cout<<*sequence->sequence<<std::endl;
990-
}else{
991-
residualEdges[pair->first] = std::make_tuple(pair->second,idCounter,0);
992980
}
993-
994-
Sequence* sequence = new Sequence {std::to_string(idCounter++), "", new std::string(revCom(backSequence) + frontSequence.substr(k))}; // add sequence
995-
std::vector<Tag> inTags = {Tag{'f',"DP",std::to_string(pair->second.cov)},Tag{'Z',"CB",colorPalette(pair->second.color)}};
996-
sequence->seqPos = seqPos++; // remember the order
997-
GFAsubgraph.appendSegment(sequence, inTags);
998-
999-
DBGsubgraph->erase(pair->first);
981+
// std::cout<<*sequence->sequence<<std::endl;
982+
}else{
983+
residualEdges[pair->first] = std::make_tuple(pair->second,idCounter,0);
1000984
}
1001-
jobWait(threadPool);
1002-
while (residualEdges.size() != 0) { // construct the edges
1003-
1004-
auto pair = *residualEdges.begin();
1005-
std::string thisSegmentHeader = std::to_string(std::get<1>(pair.second));
1006-
1007-
for (uint8_t i = 0; i<4; ++i) { // forward edges
1008-
if (std::get<0>(pair.second).fw[i] != 0) {
985+
986+
Sequence* sequence = new Sequence {std::to_string(idCounter++), "", new std::string(revCom(backSequence) + frontSequence.substr(k))}; // add sequence
987+
std::vector<Tag> inTags = {Tag{'f',"DP",std::to_string(pair->second.cov)},Tag{'Z',"CB",colorPalette(pair->second.color)}};
988+
sequence->seqPos = seqPos++; // remember the order
989+
GFAsubgraph.appendSegment(sequence, inTags);
990+
991+
DBGsubgraph->erase(pair->first);
992+
}
993+
jobWait(threadPool);
994+
while (residualEdges.size() != 0) { // construct the edges
995+
996+
auto pair = *residualEdges.begin();
997+
std::string thisSegmentHeader = std::to_string(std::get<1>(pair.second));
998+
999+
for (uint8_t i = 0; i<4; ++i) { // forward edges
1000+
if (std::get<0>(pair.second).fw[i] != 0) {
10091001

1010-
uint8_t nextKmer[k];
1011-
std::string firstKmer = reverseHash(pair.first);
1012-
firstKmer.push_back(itoc[i]);
1013-
for (uint8_t e = 0; e<k; ++e)
1014-
nextKmer[e] = ctoi[(unsigned char)firstKmer[e+1]];
1015-
1016-
bool isFw = false;
1017-
uint64_t key = hash(nextKmer, &isFw);
1018-
auto got = residualEdges.find(key);
1019-
if (got == residualEdges.end())
1020-
continue;
1021-
DBGsubgraph->erase(key);
1022-
std::string nextSegmentHeader = std::to_string(std::get<1>(got->second));
1002+
uint8_t nextKmer[k];
1003+
std::string firstKmer = reverseHash(pair.first);
1004+
firstKmer.push_back(itoc[i]);
1005+
for (uint8_t e = 0; e<k; ++e)
1006+
nextKmer[e] = ctoi[(unsigned char)firstKmer[e+1]];
1007+
1008+
bool isFw = false;
1009+
uint64_t key = hash(nextKmer, &isFw);
1010+
auto got = residualEdges.find(key);
1011+
if (got == residualEdges.end())
1012+
continue;
1013+
DBGsubgraph->erase(key);
1014+
std::string nextSegmentHeader = std::to_string(std::get<1>(got->second));
10231015

1024-
std::vector<Tag> inTags = {Tag{'i',"KC",std::to_string(std::get<0>(pair.second).fw[i])}};
1025-
InEdge edge(idCounter++, edgeCounter, headersToIds[thisSegmentHeader], headersToIds[nextSegmentHeader], std::get<2>(pair.second) ? '+' : '-', std::get<2>(got->second) ? '-' : '+', std::to_string(k-1)+"M", "edge." + std::to_string(edgeCounter), inTags);
1026-
++edgeCounter;
1016+
std::vector<Tag> inTags = {Tag{'i',"KC",std::to_string(std::get<0>(pair.second).fw[i])}};
1017+
InEdge edge(idCounter++, edgeCounter, headersToIds[thisSegmentHeader], headersToIds[nextSegmentHeader], std::get<2>(pair.second) ? '+' : '-', std::get<2>(got->second) ? '-' : '+', std::to_string(k-1)+"M", "edge." + std::to_string(edgeCounter), inTags);
1018+
++edgeCounter;
10271019

1028-
GFAsubgraph.appendEdge(edge);
1029-
}
1020+
GFAsubgraph.appendEdge(edge);
10301021
}
1031-
1032-
for (uint8_t i = 0; i<4; ++i) { // reverse edges
1033-
if (std::get<0>(pair.second).bw[i] != 0) {
1022+
}
1023+
1024+
for (uint8_t i = 0; i<4; ++i) { // reverse edges
1025+
if (std::get<0>(pair.second).bw[i] != 0) {
10341026

1035-
uint8_t nextKmer[k];
1036-
std::string firstKmer;
1037-
firstKmer.push_back(itoc[i]);
1038-
firstKmer.append(reverseHash(pair.first));
1039-
1040-
for (uint8_t e = 0; e<k; ++e)
1041-
nextKmer[e] = ctoi[(unsigned char)firstKmer[e]];
1042-
1043-
bool isFw = false;
1044-
uint64_t key = hash(nextKmer, &isFw);
1045-
auto got = residualEdges.find(key);
1046-
if (got == residualEdges.end())
1047-
continue;
1048-
DBGsubgraph->erase(key);
1049-
std::string prevSegmentHeader = std::to_string(std::get<1>(got->second));
1050-
1051-
std::vector<Tag> inTags = {Tag{'i',"KC",std::to_string(std::get<0>(pair.second).bw[i])}};
1052-
InEdge edge(idCounter++, edgeCounter, headersToIds[prevSegmentHeader], headersToIds[thisSegmentHeader], std::get<2>(got->second) ? '+' : '-', std::get<2>(pair.second) ? '-' : '+', std::to_string(k-1)+"M", "edge." + std::to_string(edgeCounter), inTags);
1053-
++edgeCounter;
1054-
GFAsubgraph.appendEdge(edge);
1055-
}
1027+
uint8_t nextKmer[k];
1028+
std::string firstKmer;
1029+
firstKmer.push_back(itoc[i]);
1030+
firstKmer.append(reverseHash(pair.first));
1031+
1032+
for (uint8_t e = 0; e<k; ++e)
1033+
nextKmer[e] = ctoi[(unsigned char)firstKmer[e]];
1034+
1035+
bool isFw = false;
1036+
uint64_t key = hash(nextKmer, &isFw);
1037+
auto got = residualEdges.find(key);
1038+
if (got == residualEdges.end())
1039+
continue;
1040+
DBGsubgraph->erase(key);
1041+
std::string prevSegmentHeader = std::to_string(std::get<1>(got->second));
1042+
1043+
std::vector<Tag> inTags = {Tag{'i',"KC",std::to_string(std::get<0>(pair.second).bw[i])}};
1044+
InEdge edge(idCounter++, edgeCounter, headersToIds[prevSegmentHeader], headersToIds[thisSegmentHeader], std::get<2>(got->second) ? '+' : '-', std::get<2>(pair.second) ? '-' : '+', std::to_string(k-1)+"M", "edge." + std::to_string(edgeCounter), inTags);
1045+
++edgeCounter;
1046+
GFAsubgraph.appendEdge(edge);
10561047
}
1057-
residualEdges.erase(residualEdges.begin());
10581048
}
1049+
residualEdges.erase(residualEdges.begin());
1050+
}
1051+
1052+
}
1053+
1054+
void DBG::DBGgraphToGFA() {
1055+
1056+
if (!userInput.noCollapse) {
1057+
1058+
collapseNodes();
10591059

10601060
}else{
1061+
1062+
uint32_t idCounter = 0, seqPos = 0, edgeCounter = 0;
1063+
phmap::flat_hash_map<std::string, unsigned int>& headersToIds = *GFAsubgraph.getHash1();
10611064

10621065
phmap::parallel_flat_hash_map<uint64_t, std::string> headerLookupTable;
10631066

0 commit comments

Comments
 (0)