
Commit 9e6d531

improves caching

WolfgangFahl committed Dec 24, 2023
1 parent 4957efd

Showing 3 changed files with 83 additions and 26 deletions.
2 changes: 1 addition & 1 deletion ceurws/__init__.py
@@ -1 +1 @@
__version__ = "0.2.6"
__version__ = "0.3.0"
48 changes: 43 additions & 5 deletions ceurws/utils/json_cache.py
@@ -3,7 +3,16 @@
from typing import Union

from orjson import orjson
from dataclasses import dataclass
from typing import Optional
from datetime import datetime

@dataclass
class CacheInfo:
name: str
size: int # size in bytes
count: Optional[int] = None # number of items in the cache, if applicable
last_accessed: Optional[datetime] = None # last accessed timestamp

class JsonCacheManager():
"""
@@ -14,6 +23,8 @@ def __init__(self):
"""
constructor
"""
self.lods = {}  # in-memory copies of the loaded lists of dicts, keyed by cache name
self.cache_infos = {}  # CacheInfo records, keyed by cache name

def json_path(self, lod_name: str) -> str:
"""
@@ -36,7 +47,6 @@ def load(self, lod_name: str) -> Union[list, dict]:
Args:
lod_name(str): the name of the list of dicts cache to read
Returns:
list: the list of dicts
None: if lod is not cached
@@ -46,14 +56,22 @@ def load(self, lod_name: str) -> Union[list, dict]:
if self.is_stored(json_path):
with open(json_path) as json_file:
lod = orjson.loads(json_file.read())
self.update_cache_info(lod_name, lod)
return lod


def update_cache_info(self, lod_name: str, lod: Union[list, dict]):
"""
update the in-memory copy and the CacheInfo record for the given cache
"""
self.lods[lod_name] = lod
self.cache_infos[lod_name] = self.get_cache_info(lod_name)

def is_stored(self, json_path: str) -> bool:
"""
Returns true if the given path exists and the file is non-empty
"""
return os.path.isfile(json_path) and os.path.getsize(json_path) > 1

stored = os.path.isfile(json_path) and os.path.getsize(json_path) > 1
return stored

def store(self, lod_name: str, lod: Union[list, dict]):
"""
@@ -67,4 +85,24 @@ def store(self, lod_name: str, lod: Union[list, dict]):
json_str = orjson.dumps(lod, option=orjson.OPT_INDENT_2)
with open(json_path, 'wb') as json_file:
json_file.write(json_str)
self.update_cache_info(lod_name, lod)

def get_cache_info(self, lod_name: str) -> CacheInfo:
"""
Get information about the cache.
Args:
lod_name(str): the name of the list of dicts cache to inspect
Returns:
CacheInfo: the information about the cache
"""
json_path = self.json_path(lod_name)
size = os.path.getsize(json_path) if os.path.isfile(json_path) else 0
last_accessed = datetime.fromtimestamp(os.path.getmtime(json_path)) if os.path.isfile(json_path) else None
count = 0
if lod_name in self.lods:
lod = self.lods[lod_name]
count = len(lod) if lod else 0
cache_info = CacheInfo(name=lod_name, size=size, count=count, last_accessed=last_accessed)
return cache_info
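
The cache bookkeeping added above can be exercised as follows. This is a minimal usage sketch, not part of the commit: the cache name and the example records are made up, and it assumes json_path resolves to a writable location.

from ceurws.utils.json_cache import JsonCacheManager

manager = JsonCacheManager()
# store a (hypothetical) list of dicts under a cache name
manager.store("volumes", [{"number": 1}, {"number": 2}])
# load reads the JSON file back and refreshes the bookkeeping
lod = manager.load("volumes")
# both store and load update the CacheInfo record via update_cache_info
info = manager.cache_infos["volumes"]
print(info.name, info.size, info.count, info.last_accessed)
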
59 changes: 39 additions & 20 deletions ceurws/wikidatasync.py
@@ -830,55 +830,69 @@ def __init__(self, endpoint):
qYamlFile = f"{path}/resources/queries/dblp.yaml"
if os.path.isfile(qYamlFile):
self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
# there is one cache manager for all our json caches
self.json_cache_manager = JsonCacheManager()

def get_all_ceur_authors(self) -> List[DblpScholar]:
def get_all_ceur_authors(self, force_query: bool = False) -> List[DblpScholar]:
"""
Get all authors that have published a paper in CEUR-WS from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS Paper Authors"]
cache_name = "dblp/authors"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
authors = []
for d in lod:
author = DblpScholar(**d)
authors.append(author)
JsonCacheManager().store(cache_name, [dataclasses.asdict(author)for author in authors])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(author) for author in authors])
else:
authors = [DblpScholar(**d) for d in lod]

return authors

def get_all_ceur_editors(self) -> List[DblpScholar]:
def get_all_ceur_editors(self, force_query: bool = False) -> List[DblpScholar]:
"""
Get all editors of CEUR-WS proceedings from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS all Editors"]
cache_name = "dblp/editors"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
editors = []
for d in lod:
editor = DblpScholar(**d)
editors.append(editor)
JsonCacheManager().store(cache_name, [dataclasses.asdict(editor)for editor in editors])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(editor) for editor in editors])
else:
editors = [DblpScholar(**d) for d in lod]
return editors

def get_all_ceur_papers(self) -> List[DblpPaper]:
def get_all_ceur_papers(self, force_query: bool = False) -> List[DblpPaper]:
"""
Get all papers published in CEUR-WS from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS all Papers"]
cache_name = "dblp/papers"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
authors = self.get_all_ceur_authors()
authors = self.get_all_ceur_authors(force_query)
authorsById = {a.dblp_author_id: a for a in authors}
papers = []
for d in lod:
@@ -901,10 +915,10 @@ def get_all_ceur_papers(self) -> List[DblpPaper]:
authors=authors
)
papers.append(paper)
JsonCacheManager().store(cache_name, [dataclasses.asdict(paper)for paper in papers])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(paper) for paper in papers])
papers_by_volume = LOD.getLookup(papers, "volume_number", withDuplicates=True)
for volume_bumber, vol_papers in papers_by_volume.items():
JsonCacheManager().store(f"dblp/Vol-{volume_bumber}/papers", [dataclasses.asdict(paper) for paper in vol_papers])
for volume_number, vol_papers in papers_by_volume.items():
self.json_cache_manager.store(f"dblp/Vol-{volume_number}/papers", [dataclasses.asdict(paper) for paper in vol_papers])
else:
papers = [DblpPaper(**d) for d in lod]
return papers
@@ -945,14 +959,19 @@ def getDblpIdByVolumeNumber(self, number) -> List[str]:
qIds = [record.get("proceeding")[len(self.DBLP_REC_PREFIX):] for record in qres]
return qIds

def get_all_ceur_proceedings(self) -> List[DblpProceeding]:
def get_all_ceur_proceedings(self, force_query: bool = False) -> List[DblpProceeding]:
"""
Get all proceedings published in CEUR-WS from dblp
Args:
force_query(bool): if True, bypass the JSON cache and query dblp directly
"""
query = self.qm.queriesByName["CEUR-WS all Volumes"]
cache_name = "dblp/volumes"
lod = JsonCacheManager().load(cache_name)
if lod is None:
if not force_query:
lod = self.json_cache_manager.load(cache_name)
if force_query or lod is None:
lod = self.sparql.queryAsListOfDicts(query.query)
editors = self.get_all_ceur_editors(force_query)
editorsById = {a.dblp_author_id: a for a in editors}
@@ -974,10 +993,10 @@ def get_all_ceur_proceedings(self) -> List[DblpProceeding]:
papers=papersByProceeding.get(d.get("proceeding"))
)
volumes.append(volume)
JsonCacheManager().store(cache_name, [dataclasses.asdict(volume)for volume in volumes])
self.json_cache_manager.store(cache_name, [dataclasses.asdict(volume) for volume in volumes])
volume_by_number, _errors = LOD.getLookup(volumes, "volume_number")
for number, volume in volume_by_number.items():
JsonCacheManager().store(f"dblp/Vol-{number}/metadata", dataclasses.asdict(volume))
self.json_cache_manager.store(f"dblp/Vol-{number}/metadata", dataclasses.asdict(volume))
else:
volumes = [DblpProceeding(**d) for d in lod]
return volumes
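
With the shared json_cache_manager and the new force_query flag, callers can either reuse the JSON caches or refresh them from dblp in one pass. Below is a hedged sketch of the call pattern; DblpEndpoint is the assumed name of the class these methods belong to, and the endpoint URL is illustrative only.

endpoint = DblpEndpoint(endpoint="https://qlever.cs.uni-freiburg.de/api/dblp")
# first call queries dblp and (re)fills the JSON caches
papers = endpoint.get_all_ceur_papers(force_query=True)
# later calls are served from the shared JsonCacheManager
papers_again = endpoint.get_all_ceur_papers()
editors = endpoint.get_all_ceur_editors()
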
