Skip to content

Commit

Permalink
making ubergraph parsing slightly more efficient
Browse files (browse the repository at this point in the history)
  • Loading branch information
EvanDietzMorris committed Mar 17, 2023
1 parent 69918ed commit a62a687
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 22 deletions.
29 changes: 16 additions & 13 deletions parsers/UberGraph/src/loadUG.py
Original file line number Diff line number Diff line change
@@ -80,24 +80,27 @@ def parse_data(self):
with tar_files.extractfile(f'{self.nonredundant_graph_path}/edges.tsv') as edges_file:
for line in TextIOWrapper(edges_file):
record_counter += 1

if self.test_mode and record_counter == 5000:
break

subject_id, predicate_id, object_id = tuple(line.rstrip().split('\t'))
subject_curie = ubergraph_tools.get_curie_for_node_id(subject_id)
if not subject_curie:
skipped_record_counter += 1
break
object_curie = ubergraph_tools.get_curie_for_node_id(object_id)
if not object_curie:
skipped_record_counter += 1
break
predicate_curie = ubergraph_tools.get_curie_for_edge_id(predicate_id)

if subject_curie and object_curie and predicate_curie:
self.output_file_writer.write_node(node_id=subject_curie)
self.output_file_writer.write_node(node_id=object_curie)
self.output_file_writer.write_edge(subject_id=subject_curie,
object_id=object_curie,
predicate=predicate_curie,
primary_knowledge_source=self.provenance_id)
else:
if not predicate_curie:
skipped_record_counter += 1
break
self.output_file_writer.write_node(node_id=subject_curie)
self.output_file_writer.write_node(node_id=object_curie)
self.output_file_writer.write_edge(subject_id=subject_curie,
object_id=object_curie,
predicate=predicate_curie,
primary_knowledge_source=self.provenance_id)
if self.test_mode and record_counter == 5000:
break

# load up the metadata
load_metadata: dict = {
Expand Down
12 changes: 3 additions & 9 deletions parsers/UberGraph/src/ubergraph.py
Original file line number Diff line number Diff line change
@@ -27,11 +27,11 @@ def __init__(self,
ubergraph_archive_path: str = None,
graph_base_path: str = None):

self.ubergraph_archive_path = ubergraph_archive_path
self.graph_base_path = graph_base_path
self.converted_to_curies = False
self.node_curies = {}
self.edge_curies = {}
self.ubergraph_archive_path = ubergraph_archive_path
self.graph_base_path = graph_base_path
self.convert_iris_to_curies()

def convert_iris_to_curies(self):
biolink_prefix_map = self.get_biolink_prefix_map()
@@ -65,16 +65,10 @@ def convert_iris_to_curies(self):
print(f'No prefix mapping found for: {edge_iri}')
self.edge_curies[edge_id] = edge_curie

self.converted_to_curies = True

def get_curie_for_node_id(self, node_id):
if not self.converted_to_curies:
self.convert_iris_to_curies()
return self.node_curies[node_id]

def get_curie_for_edge_id(self, edge_id):
if not self.converted_to_curies:
self.convert_iris_to_curies()
return self.edge_curies[edge_id]

@staticmethod
Expand Down

0 comments on commit a62a687

Please sign in to comment.