
Commit 9e6d531

improves caching

WolfgangFahl committed Dec 24, 2023
1 parent 4957efd

Showing 3 changed files with 83 additions and 26 deletions.
2 changes: 1 addition & 1 deletion ceurws/__init__.py
@@ -1 +1 @@
__version__ = "0.2.6"
__version__ = "0.3.0"
48 changes: 43 additions & 5 deletions ceurws/utils/json_cache.py
@@ -3,7 +3,16 @@
from typing import Union

from orjson import orjson
from dataclasses import dataclass
from typing import Optional
from datetime import datetime

@dataclass
class CacheInfo:
name: str
size: int # size in bytes
count: Optional[int] = None # number of items in the cache, if applicable
last_accessed: Optional[datetime] = None # last accessed timestamp

class JsonCacheManager():
"""
@@ -14,6 +23,8 @@ def __init__(self):
"""
constructor
"""
self.lods = {}  # in-memory copies of the loaded lists of dicts, keyed by cache name
self.cache_infos = {}  # CacheInfo records, keyed by cache name

def json_path(self, lod_name: str) -> str:
"""
@@ -36,7 +47,6 @@ def load(self, lod_name: str) -> Union[list, dict]:
Args:
lod_name(str): the name of the list of dicts cache to read
Returns:
list: the list of dicts
None: if lod is not cached
@@ -46,14 +56,22 @@ def load(self, lod_name: str) -> Union[list, dict]:
if self.is_stored(json_path):
with open(json_path) as json_file:
lod = orjson.loads(json_file.read())
self.update_cache_info(lod_name, lod)
return lod


def update_cache_info(self, lod_name: str, lod: Union[list, dict]):
"""
update the in-memory copy and the CacheInfo record for the given cache
"""
self.lods[lod_name] = lod
self.cache_infos[lod_name] = self.get_cache_info(lod_name)

def is_stored(self, json_path: str) -> bool:
"""
Returns true if the given path exists and the file is non-empty
"""
return os.path.isfile(json_path) and os.path.getsize(json_path) > 1

stored = os.path.isfile(json_path) and os.path.getsize(json_path) > 1
return stored

def store(self, lod_name: str, lod: Union[list, dict]):
"""
@@ -67,4 +85,24 @@ def store(self, lod_name: str, lod: Union[list, dict]):
json_str = orjson.dumps(lod, option=orjson.OPT_INDENT_2)
with open(json_path, 'wb') as json_file:
json_file.write(json_str)
self.update_cache_info(lod_name, lod)

def get_cache_info(self, lod_name: str) -> CacheInfo:
"""
Get information about the cache.
Args:
lod_name(str): the name of the list of dicts cache to inspect
Returns:
CacheInfo: the information about the cache
"""
json_path = self.json_path(lod_name)
size = os.path.getsize(json_path) if os.path.isfile(json_path) else 0
last_accessed = datetime.fromtimestamp(os.path.getmtime(json_path)) if os.path.isfile(json_path) else None
count = 0
if lod_name in self.lods:
lod = self.lods[lod_name]
count = len(lod) if lod else 0
cache_info = CacheInfo(name=lod_name, size=size, count=count, last_accessed=last_accessed)
return cache_info
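
The cache bookkeeping added above can be exercised as follows. This is a minimal usage sketch, not part of the commit: the cache name and the example records are made up, and it assumes json_path resolves to a writable location.

from ceurws.utils.json_cache import JsonCacheManager

manager = JsonCacheManager()
# store a (hypothetical) list of dicts under a cache name
manager.store("volumes", [{"number": 1}, {"number": 2}])
# load reads the JSON file back and refreshes the bookkeeping
lod = manager.load("volumes")
# both store and load update the CacheInfo record via update_cache_info
info = manager.cache_infos["volumes"]
print(info.name, info.size, info.count, info.last_accessed)
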
59 changes: 39 additions & 20 deletions ceurws/wikidatasync.py
@@ -830,55 +830,69 @@ def __init__(self, endpoint):
qYamlFile = f"{path}/resources/queries/dblp.yaml"
if os.path.isfile(qYamlFile):
self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
# there is one cache manager for all our json caches
self.json_cache_manager = JsonCacheManager()

def get_all_ceur_authors(self) -> List[DblpScholar]:
def get_all_ceur_authors(self, force_query: bool = False) -> List[DblpScholar]:
"""
Get all authors that have published a paper in CEUR-WS from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS Paper Authors"]
cache_name = "dblp/authors"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
authors = []
for d in lod:
author = DblpScholar(**d)
authors.append(author)
JsonCacheManager().store(cache_name, [dataclasses.asdict(author)for author in authors])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(author) for author in authors])
else:
authors = [DblpScholar(**d) for d in lod]

return authors

def get_all_ceur_editors(self) -> List[DblpScholar]:
def get_all_ceur_editors(self, force_query: bool = False) -> List[DblpScholar]:
"""
Get all editors of CEUR-WS proceedings from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS all Editors"]
cache_name = "dblp/editors"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
editors = []
for d in lod:
editor = DblpScholar(**d)
editors.append(editor)
JsonCacheManager().store(cache_name, [dataclasses.asdict(editor)for editor in editors])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(editor) for editor in editors])
else:
editors = [DblpScholar(**d) for d in lod]
return editors

def get_all_ceur_papers(self) -> List[DblpPaper]:
def get_all_ceur_papers(self, force_query: bool = False) -> List[DblpPaper]:
"""
Get all papers published in CEUR-WS from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS all Papers"]
cache_name = "dblp/papers"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
authors = self.get_all_ceur_authors()
authors = self.get_all_ceur_authors(force_query)
authorsById = {a.dblp_author_id: a for a in authors}
papers = []
for d in lod:
@@ -901,10 +915,10 @@ def get_all_ceur_papers(self) -> List[DblpPaper]:
authors=authors
)
papers.append(paper)
JsonCacheManager().store(cache_name, [dataclasses.asdict(paper)for paper in papers])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(paper) for paper in papers])
papers_by_volume = LOD.getLookup(papers, "volume_number", withDuplicates=True)
for volume_bumber, vol_papers in papers_by_volume.items():
JsonCacheManager().store(f"dblp/Vol-{volume_bumber}/papers", [dataclasses.asdict(paper) for paper in vol_papers])
for volume_number, vol_papers in papers_by_volume.items():
self.json_cache_manager.store(f"dblp/Vol-{volume_number}/papers", [dataclasses.asdict(paper) for paper in vol_papers])
else:
papers = [DblpPaper(**d) for d in lod]
return papers
@@ -945,14 +959,19 @@ def getDblpIdByVolumeNumber(self, number) -> List[str]:
qIds = [record.get("proceeding")[len(self.DBLP_REC_PREFIX):] for record in qres]
return qIds

def get_all_ceur_proceedings(self) -> List[DblpProceeding]:
def get_all_ceur_proceedings(self, force_query: bool = False) -> List[DblpProceeding]:
"""
Get all proceedings published in CEUR-WS from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS all Volumes"]
cache_name = "dblp/volumes"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
editors = self.get_all_ceur_editors(force_query)
editorsById = {a.dblp_author_id: a for a in editors}
@@ -974,10 +993,10 @@ def get_all_ceur_proceedings(self) -> List[DblpProceeding]:
papers=papersByProceeding.get(d.get("proceeding"))
)
volumes.append(volume)
JsonCacheManager().store(cache_name, [dataclasses.asdict(volume)for volume in volumes])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(volume) for volume in volumes])
volume_by_number, _errors = LOD.getLookup(volumes, "volume_number")
for number, volume in volume_by_number.items():
JsonCacheManager().store(f"dblp/Vol-{number}/metadata", dataclasses.asdict(volume))
self.json_cache_manager.store(f"dblp/Vol-{number}/metadata", dataclasses.asdict(volume))
else:
volumes = [DblpProceeding(**d) for d in lod]
return volumes
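
With the shared json_cache_manager and the new force_query flag, callers can either reuse the JSON caches or refresh them from dblp in one pass. Below is a hedged sketch of the call pattern; DblpEndpoint is the assumed name of the class these methods belong to, and the endpoint URL is illustrative only.

endpoint = DblpEndpoint(endpoint="https://qlever.cs.uni-freiburg.de/api/dblp")
# first call queries dblp and (re)fills the JSON caches
papers = endpoint.get_all_ceur_papers(force_query=True)
# later calls are served from the shared JsonCacheManager
papers_again = endpoint.get_all_ceur_papers()
editors = endpoint.get_all_ceur_editors()
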
