Skip to content

Commit

Permalink
tries fixing --update
Browse files Browse the repository at this point in the history
  • Loading branch information
WolfgangFahl committed Feb 24, 2024
1 parent 0a82f9f commit 5e3ec08
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 14 deletions.
24 changes: 15 additions & 9 deletions ceurws/ceur_ws.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from lodstorage.entity import EntityManager
from lodstorage.jsonable import JSONAble
from lodstorage.storageconfig import StorageConfig
from tqdm import tqdm

from ceurws.indexparser import IndexHtmlParser, ParserConfig
from ceurws.loctime import LoctimeParser
Expand Down Expand Up @@ -365,7 +364,7 @@ def load(self):
load the volumeManager
"""
if Download.needsDownload(CEURWS.CACHE_FILE):
self.loadFromIndexHtml(force=True)
self.loadFromIndexHtml()
else:
self.loadFromBackup()

Expand All @@ -379,12 +378,15 @@ def update(self,parser_config:ParserConfig):
"""
update me by checking for recently added volumes
"""
max_vol=self.volumes[len(self.volumes)-1]
parser_config.down_to_volume=max_vol.number+1
self.update_or_recreate(parser_config)

def recreate(self,parser_config:ParserConfig):
"""
recreate me by a full parse of all volume files
"""

self.update_or_recreate(parser_config)

def update_or_recreate(self,parser_config:ParserConfig):
Expand All @@ -398,11 +400,16 @@ def update_or_recreate(self,parser_config:ParserConfig):
progress_bar=parser_config.progress_bar
loctime_parser = LoctimeParser()
pm = PaperManager()
if parser_config.down_to_volume!=1:
pm.fromStore(cacheFile=CEURWS.CACHE_FILE)
paper_list = pm.getList()

# first reload me from the main index
self.loadFromIndexHtml(force=True)
self.loadFromIndexHtml(parser_config)
invalid = 0
for volume in self.volumes:
if volume.number < parser_config.down_to_volume:
break
_volume_record, soup = volume.extractValuesFromVolumePage()
if soup:
ptp = PaperTocParser(number=volume.number, soup=soup, debug=self.debug)
Expand Down Expand Up @@ -438,17 +445,16 @@ def update_or_recreate(self,parser_config:ParserConfig):
print(f"storing {len(paper_list)} papers")
pm.store()

def loadFromIndexHtml(self, force: bool = False):
def loadFromIndexHtml(self, parser_config:ParserConfig=None):
"""
load my content from the index.html file
Args:
force(bool): if TRUE fetch index.html
from ceur-ws.org internet homepage
else read locally cached version
parser_config(ParserConfig): the parser Configuration to use
"""
htmlText = self.getIndexHtml(force=force)
indexParser = IndexHtmlParser(htmlText, debug=self.debug)
force=parser_config.force_download if parser_config else True
htmlText = self.getIndexHtml(force)
indexParser = IndexHtmlParser(htmlText,parser_config)
volumeRecords = indexParser.parse()
for volumeRecord in volumeRecords.values():
volume = Volume()
Expand Down
10 changes: 7 additions & 3 deletions ceurws/ceur_ws_web_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ngwidgets.cmd import WebserverCmd
from tabulate import tabulate
from tqdm import tqdm

from ceurws.indexparser import ParserConfig
from ceurws.ceur_ws import VolumeManager
from ceurws.namedqueries import NamedQueries
from ceurws.webserver import CeurWsWebServer
Expand Down Expand Up @@ -103,11 +103,15 @@ def handle_args(self) -> bool:
print(volume)
if args.recreate or args.update:
manager = VolumeManager()
manager.load()
progress_bar=tqdm(total=len(manager.volumes))
parser_config=ParserConfig(progress_bar,
debug=args.debug)

if args.recreate:
manager.recreate(progress_bar=progress_bar)
manager.recreate(parser_config)
else:
manager.update(progress_bar=progress_bar)
manager.update(parser_config)
if args.wikidata_update:
wdsync = WikidataSync.from_args(args)
wdsync.update(withStore=True)
Expand Down
7 changes: 5 additions & 2 deletions ceurws/indexparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,10 @@ def parse(self):
# synchronize on <tr><th and not on end since trailing TR might be missing
lineNo = volStartLine + 1
if "number" in volume:
volumes[volume["number"]] = volume
volume_number=volume["number"]
if volume_number<self.config.down_to_volume:
break
volumes[volume_number] = volume
else:
self.log(f"volume not found for volume at {volStartLine}")
return volumes
return volumes

0 comments on commit 5e3ec08

Please sign in to comment.