@@ -887,177 +887,180 @@ void DBG::nextKmerFromString(uint8_t *nextKmer, std::string *sequence, uint64_t
887
887
888
888
}
889
889
890
- bool DBG::isKeyFw (uint64_t key) { // this doesn't make any sense
891
- bool isFw;
892
- std::string kmer = reverseHash (key);
893
- uint8_t thisKmer[k];
894
- for (uint8_t e = 0 ; e<k; ++e)
895
- thisKmer[e] = ctoi[(unsigned char )kmer[e]];
896
- hash (thisKmer, &isFw);
897
- return isFw;
898
- }
899
-
900
- void DBG::DBGgraphToGFA () {
901
-
890
+ void DBG::collapseNodes () {
891
+
902
892
uint32_t idCounter = 0 , seqPos = 0 , edgeCounter = 0 ;
903
893
phmap::flat_hash_map<std::string, unsigned int >& headersToIds = *GFAsubgraph.getHash1 ();
894
+ phmap::parallel_flat_hash_map<uint64_t , std::tuple<DBGkmer32,uint32_t ,bool >> residualEdges; // hash, kmer, G' node, node side
904
895
905
- if (!userInput. noCollapse ) {
896
+ auto extend = [&, this ] (std::string &seed, int8_t direction ) {
906
897
907
- phmap::parallel_flat_hash_map<uint64_t , std::tuple<DBGkmer32,uint32_t ,bool >> residualEdges; // hash, kmer, G' node, node side
898
+ uint64_t key, baseCounter = 0 ;
899
+ bool isFw;
900
+ uint8_t nextKmer[k];
901
+ for (uint8_t e = 0 ; e<k; ++e)
902
+ nextKmer[e] = ctoi[(unsigned char )seed[e]];
903
+ key = hash (nextKmer, &isFw);
908
904
909
- auto extend = [&,this ] (std::pair<uint64_t , DBGkmer32color> node, std::string &seed, int8_t side) {
905
+ std::pair<uint64_t , DBGkmer32color> node = *DBGsubgraph->find (key);
906
+
907
+ if ((direction ? node.second .fwCount () : node.second .bwCount ()) > 1 ) {
908
+ std::cout<<" Branching node side, cannot extend. Terminating." <<std::endl;
909
+ exit (EXIT_FAILURE);
910
+ }else if ((direction ? node.second .fwCount () : node.second .bwCount ()) == 0 ){
911
+ std::cout<<" Dead end, cannot extend. Terminating." <<std::endl;
912
+ exit (EXIT_FAILURE);
913
+ }
914
+
915
+ while (true ) {
910
916
911
- if ((side ? node.second .fwCount () : node.second .bwCount ()) > 1 ) {
912
- std::cout<<" Branching node side, cannot extend. Terminating." <<std::endl;
913
- exit (EXIT_FAILURE);
914
- }else if ((side ? node.second .fwCount () : node.second .bwCount ()) == 0 ){
915
- std::cout<<" Dead end, cannot extend. Terminating." <<std::endl;
916
- exit (EXIT_FAILURE);
917
- }
917
+ uint8_t i = isFw ? node.second .fwEdgeIndexes ()[0 ] : 3 -node.second .bwEdgeIndexes ()[0 ];
918
918
919
- uint64_t key, baseCounter = 0 ;
920
- bool isFw = isKeyFw (node. first );
921
- isFw = side ? isFw : !isFw ;
919
+ uint8_t nextKmer[k] ;
920
+ nextKmerFromString (nextKmer, &seed, baseCounter++, i );
921
+ key = hash (nextKmer, & isFw) ;
922
922
923
- while (true ) {
924
-
925
- uint8_t i = isFw ? node.second .fwEdgeIndexes ()[0 ] : 3 -node.second .bwEdgeIndexes ()[0 ];
926
-
927
- uint8_t nextKmer[k];
928
- nextKmerFromString (nextKmer, &seed, baseCounter++, i);
929
- key = hash (nextKmer, &isFw);
930
-
931
923
// std::cout<<seed<<std::endl;
932
924
// std::cout<<+i<<std::endl;
933
- auto prevNode = node;
934
- auto got = DBGsubgraph->find (key);
935
-
936
- if (got == DBGsubgraph->end ()) { // we found a novel dead end
937
- auto got = residualEdges.find (key); // we couldn't find the node as it was already visited and deleted
938
- if (got != residualEdges.end ()) {
939
- residualEdges[prevNode.first ] = std::make_tuple (prevNode.second ,idCounter,side);
940
-
941
- }
942
- break ; // real dead ends
925
+ auto prevNode = node;
926
+ auto got = DBGsubgraph->find (key);
927
+
928
+ if (got == DBGsubgraph->end ()) { // we found a novel dead end
929
+ auto got = residualEdges.find (key); // we couldn't find the node as it was already visited and deleted
930
+ if (got != residualEdges.end ()) {
931
+ residualEdges[prevNode.first ] = std::make_tuple (prevNode.second ,idCounter,direction);
932
+
943
933
}
934
+ break ; // real dead ends
935
+ }
944
936
945
- node = *got;
937
+ node = *got;
946
938
947
- std::vector<uint8_t > nextFrontEdges = isFw ? node.second .fwEdgeIndexes () : node.second .bwEdgeIndexes ();
948
- std::vector<uint8_t > nextBackEdges = isFw ? node.second .bwEdgeIndexes () : node.second .fwEdgeIndexes ();
949
-
950
- if (nextBackEdges.size () > 1 ) { // this node is branching back, we cannot include it
951
- residualEdges[prevNode.first ] = std::make_tuple (prevNode.second ,idCounter,side);
952
- break ;
953
- }
954
-
955
- seed.push_back (itoc[i]); // append its base
956
- DBGsubgraph->erase (key); // we can now safely erase as these nodes don't need to be stored
957
-
958
- if (nextFrontEdges.size () == 0 ) { // we found a dead end
959
- break ;
960
- } else if (nextFrontEdges.size () > 1 ) { // we found a potentially fw branching node, if true, nothing more to be done
961
- residualEdges[node.first ] = std::make_tuple (node.second ,idCounter,side); // we preserve the edge information
962
- break ;
963
- }
939
+ std::vector<uint8_t > nextFrontEdges = isFw ? node.second .fwEdgeIndexes () : node.second .bwEdgeIndexes ();
940
+ std::vector<uint8_t > nextBackEdges = isFw ? node.second .bwEdgeIndexes () : node.second .fwEdgeIndexes ();
941
+
942
+ if (nextBackEdges.size () > 1 ) { // this node is branching back, we cannot include it
943
+ residualEdges[prevNode.first ] = std::make_tuple (prevNode.second ,idCounter,direction);
944
+ break ;
964
945
}
965
- };
966
-
967
- while (DBGsubgraph->size () != 0 ) { // until all nodes have been merged or outputted
968
946
969
- auto pair = DBGsubgraph->begin (); // pick a random node
970
- std::string frontSequence = reverseHash (pair->first ); // we grow the sequence in both directions
971
- std::string backSequence = revCom (reverseHash (pair->first ));
972
-
973
- uint8_t edgeCounts[2 ] = {pair->second .bwCount (), pair->second .fwCount ()};
947
+ seed.push_back (itoc[i]); // append its base
948
+ DBGsubgraph->erase (key); // we can now safely erase as these nodes don't need to be stored
974
949
975
- if (edgeCounts[0 ] == 1 || edgeCounts[1 ] == 1 ) { // we are at a branch, otherwise we are in the middle, nothing can be merged safely
976
-
977
- for (int8_t side = 1 ; side >= 0 ; --side) {
950
+ if (nextFrontEdges.size () == 0 ) { // we found a dead end
951
+ break ;
952
+ } else if (nextFrontEdges.size () > 1 ) { // we found a potentially fw branching node, if true, nothing more to be done
953
+ residualEdges[node.first ] = std::make_tuple (node.second ,idCounter,direction); // we preserve the edge information
954
+ break ;
955
+ }
956
+ }
957
+ };
958
+
959
+ while (DBGsubgraph->size () != 0 ) { // until all nodes have been merged or outputted
960
+
961
+ auto pair = DBGsubgraph->begin (); // pick a random node
962
+ std::string frontSequence = reverseHash (pair->first ); // we grow the sequence in both directions
963
+ std::string backSequence = revCom (reverseHash (pair->first ));
964
+
965
+ uint8_t edgeCounts[2 ] = {pair->second .bwCount (), pair->second .fwCount ()};
966
+
967
+ if (edgeCounts[0 ] == 1 || edgeCounts[1 ] == 1 ) { // we are in the middle or at a partial branch, we can start merging
968
+
969
+ for (int8_t direction = 1 ; direction >= 0 ; --direction) { // we potentially extend in both directions
970
+
971
+ if (edgeCounts[direction] == 1 ) { // we can extend if we are at a branch and this the non branching side
978
972
979
- if (edgeCounts[side] == 1 ) { // we can extend if we are at a branch and this the non branching side
980
-
981
- extend (*pair, (side ? frontSequence : backSequence), side);
973
+ extend ((direction ? frontSequence : backSequence), direction);
982
974
// std::cout<<"sequence: "<<(!side ? frontSequence : backSequence)<<std::endl;
983
-
984
-
985
- }else if (edgeCounts[side] > 1 ){ // if branch, we keep track of neighbours, otherwise it's a dead end and we pick another node
986
- residualEdges[pair->first ] = std::make_tuple (pair->second ,idCounter,side); // we preserve the edge
987
- }
975
+
976
+
977
+ }else if (edgeCounts[direction] > 1 ){ // if branch, we keep track of neighbours, otherwise it's a dead end and we pick another node
978
+ residualEdges[pair->first ] = std::make_tuple (pair->second ,idCounter,direction); // we preserve the edge
988
979
}
989
- // std::cout<<*sequence->sequence<<std::endl;
990
- }else {
991
- residualEdges[pair->first ] = std::make_tuple (pair->second ,idCounter,0 );
992
980
}
993
-
994
- Sequence* sequence = new Sequence {std::to_string (idCounter++), " " , new std::string (revCom (backSequence) + frontSequence.substr (k))}; // add sequence
995
- std::vector<Tag> inTags = {Tag{' f' ," DP" ,std::to_string (pair->second .cov )},Tag{' Z' ," CB" ,colorPalette (pair->second .color )}};
996
- sequence->seqPos = seqPos++; // remember the order
997
- GFAsubgraph.appendSegment (sequence, inTags);
998
-
999
- DBGsubgraph->erase (pair->first );
981
+ // std::cout<<*sequence->sequence<<std::endl;
982
+ }else {
983
+ residualEdges[pair->first ] = std::make_tuple (pair->second ,idCounter,0 );
1000
984
}
1001
- jobWait (threadPool);
1002
- while (residualEdges.size () != 0 ) { // construct the edges
1003
-
1004
- auto pair = *residualEdges.begin ();
1005
- std::string thisSegmentHeader = std::to_string (std::get<1 >(pair.second ));
1006
-
1007
- for (uint8_t i = 0 ; i<4 ; ++i) { // forward edges
1008
- if (std::get<0 >(pair.second ).fw [i] != 0 ) {
985
+
986
+ Sequence* sequence = new Sequence {std::to_string (idCounter++), " " , new std::string (revCom (backSequence) + frontSequence.substr (k))}; // add sequence
987
+ std::vector<Tag> inTags = {Tag{' f' ," DP" ,std::to_string (pair->second .cov )},Tag{' Z' ," CB" ,colorPalette (pair->second .color )}};
988
+ sequence->seqPos = seqPos++; // remember the order
989
+ GFAsubgraph.appendSegment (sequence, inTags);
990
+
991
+ DBGsubgraph->erase (pair->first );
992
+ }
993
+ jobWait (threadPool);
994
+ while (residualEdges.size () != 0 ) { // construct the edges
995
+
996
+ auto pair = *residualEdges.begin ();
997
+ std::string thisSegmentHeader = std::to_string (std::get<1 >(pair.second ));
998
+
999
+ for (uint8_t i = 0 ; i<4 ; ++i) { // forward edges
1000
+ if (std::get<0 >(pair.second ).fw [i] != 0 ) {
1009
1001
1010
- uint8_t nextKmer[k];
1011
- std::string firstKmer = reverseHash (pair.first );
1012
- firstKmer.push_back (itoc[i]);
1013
- for (uint8_t e = 0 ; e<k; ++e)
1014
- nextKmer[e] = ctoi[(unsigned char )firstKmer[e+1 ]];
1015
-
1016
- bool isFw = false ;
1017
- uint64_t key = hash (nextKmer, &isFw);
1018
- auto got = residualEdges.find (key);
1019
- if (got == residualEdges.end ())
1020
- continue ;
1021
- DBGsubgraph->erase (key);
1022
- std::string nextSegmentHeader = std::to_string (std::get<1 >(got->second ));
1002
+ uint8_t nextKmer[k];
1003
+ std::string firstKmer = reverseHash (pair.first );
1004
+ firstKmer.push_back (itoc[i]);
1005
+ for (uint8_t e = 0 ; e<k; ++e)
1006
+ nextKmer[e] = ctoi[(unsigned char )firstKmer[e+1 ]];
1007
+
1008
+ bool isFw = false ;
1009
+ uint64_t key = hash (nextKmer, &isFw);
1010
+ auto got = residualEdges.find (key);
1011
+ if (got == residualEdges.end ())
1012
+ continue ;
1013
+ DBGsubgraph->erase (key);
1014
+ std::string nextSegmentHeader = std::to_string (std::get<1 >(got->second ));
1023
1015
1024
- std::vector<Tag> inTags = {Tag{' i' ," KC" ,std::to_string (std::get<0 >(pair.second ).fw [i])}};
1025
- InEdge edge (idCounter++, edgeCounter, headersToIds[thisSegmentHeader], headersToIds[nextSegmentHeader], std::get<2 >(pair.second ) ? ' +' : ' -' , std::get<2 >(got->second ) ? ' -' : ' +' , std::to_string (k-1 )+" M" , " edge." + std::to_string (edgeCounter), inTags);
1026
- ++edgeCounter;
1016
+ std::vector<Tag> inTags = {Tag{' i' ," KC" ,std::to_string (std::get<0 >(pair.second ).fw [i])}};
1017
+ InEdge edge (idCounter++, edgeCounter, headersToIds[thisSegmentHeader], headersToIds[nextSegmentHeader], std::get<2 >(pair.second ) ? ' +' : ' -' , std::get<2 >(got->second ) ? ' -' : ' +' , std::to_string (k-1 )+" M" , " edge." + std::to_string (edgeCounter), inTags);
1018
+ ++edgeCounter;
1027
1019
1028
- GFAsubgraph.appendEdge (edge);
1029
- }
1020
+ GFAsubgraph.appendEdge (edge);
1030
1021
}
1031
-
1032
- for (uint8_t i = 0 ; i<4 ; ++i) { // reverse edges
1033
- if (std::get<0 >(pair.second ).bw [i] != 0 ) {
1022
+ }
1023
+
1024
+ for (uint8_t i = 0 ; i<4 ; ++i) { // reverse edges
1025
+ if (std::get<0 >(pair.second ).bw [i] != 0 ) {
1034
1026
1035
- uint8_t nextKmer[k];
1036
- std::string firstKmer;
1037
- firstKmer.push_back (itoc[i]);
1038
- firstKmer.append (reverseHash (pair.first ));
1039
-
1040
- for (uint8_t e = 0 ; e<k; ++e)
1041
- nextKmer[e] = ctoi[(unsigned char )firstKmer[e]];
1042
-
1043
- bool isFw = false ;
1044
- uint64_t key = hash (nextKmer, &isFw);
1045
- auto got = residualEdges.find (key);
1046
- if (got == residualEdges.end ())
1047
- continue ;
1048
- DBGsubgraph->erase (key);
1049
- std::string prevSegmentHeader = std::to_string (std::get<1 >(got->second ));
1050
-
1051
- std::vector<Tag> inTags = {Tag{' i' ," KC" ,std::to_string (std::get<0 >(pair.second ).bw [i])}};
1052
- InEdge edge (idCounter++, edgeCounter, headersToIds[prevSegmentHeader], headersToIds[thisSegmentHeader], std::get<2 >(got->second ) ? ' +' : ' -' , std::get<2 >(pair.second ) ? ' -' : ' +' , std::to_string (k-1 )+" M" , " edge." + std::to_string (edgeCounter), inTags);
1053
- ++edgeCounter;
1054
- GFAsubgraph.appendEdge (edge);
1055
- }
1027
+ uint8_t nextKmer[k];
1028
+ std::string firstKmer;
1029
+ firstKmer.push_back (itoc[i]);
1030
+ firstKmer.append (reverseHash (pair.first ));
1031
+
1032
+ for (uint8_t e = 0 ; e<k; ++e)
1033
+ nextKmer[e] = ctoi[(unsigned char )firstKmer[e]];
1034
+
1035
+ bool isFw = false ;
1036
+ uint64_t key = hash (nextKmer, &isFw);
1037
+ auto got = residualEdges.find (key);
1038
+ if (got == residualEdges.end ())
1039
+ continue ;
1040
+ DBGsubgraph->erase (key);
1041
+ std::string prevSegmentHeader = std::to_string (std::get<1 >(got->second ));
1042
+
1043
+ std::vector<Tag> inTags = {Tag{' i' ," KC" ,std::to_string (std::get<0 >(pair.second ).bw [i])}};
1044
+ InEdge edge (idCounter++, edgeCounter, headersToIds[prevSegmentHeader], headersToIds[thisSegmentHeader], std::get<2 >(got->second ) ? ' +' : ' -' , std::get<2 >(pair.second ) ? ' -' : ' +' , std::to_string (k-1 )+" M" , " edge." + std::to_string (edgeCounter), inTags);
1045
+ ++edgeCounter;
1046
+ GFAsubgraph.appendEdge (edge);
1056
1047
}
1057
- residualEdges.erase (residualEdges.begin ());
1058
1048
}
1049
+ residualEdges.erase (residualEdges.begin ());
1050
+ }
1051
+
1052
+ }
1053
+
1054
+ void DBG::DBGgraphToGFA () {
1055
+
1056
+ if (!userInput.noCollapse ) {
1057
+
1058
+ collapseNodes ();
1059
1059
1060
1060
}else {
1061
+
1062
+ uint32_t idCounter = 0 , seqPos = 0 , edgeCounter = 0 ;
1063
+ phmap::flat_hash_map<std::string, unsigned int >& headersToIds = *GFAsubgraph.getHash1 ();
1061
1064
1062
1065
phmap::parallel_flat_hash_map<uint64_t , std::string> headerLookupTable;
1063
1066
0 commit comments