From 86b3f58e7e32f1e5ec0d2a8c98eaf4d93bf87dec Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Tue, 24 Oct 2023 12:38:41 +0300 Subject: [PATCH 01/34] fix(Admin Editors): verify places are real cities --- sefaria/model/place.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sefaria/model/place.py b/sefaria/model/place.py index a39bf0e610..c728a4c141 100644 --- a/sefaria/model/place.py +++ b/sefaria/model/place.py @@ -64,7 +64,11 @@ def create_new_place(cls, en, he=None): def city_to_coordinates(self, city): geolocator = Nominatim(user_agent='hello@sefaria.org') location = geolocator.geocode(city) - self.point_location(lon=location.longitude, lat=location.latitude) + if location and location.raw['type'] in ['administrative', 'city', 'town', 'municipality']: + self.point_location(lon=location.longitude, lat=location.latitude) + else: + raise InputError(f"{city} is not a real city.") + def point_location(self, lon=None, lat=None): if lat is None and lon is None: From 5b89b5fd4e23686088b7d1708dc8b3014304744b Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Mon, 30 Oct 2023 10:08:06 +0200 Subject: [PATCH 02/34] chore: just starting --- sefaria/model/text.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index d8484b59b7..f442da2336 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -2720,7 +2720,8 @@ def __clean_tref(tref, lang): try: # capitalize first letter (don't title case all to avoid e.g., "Song Of Songs") - tref = tref[0].upper() + tref[1:] + # tref = tref[0].upper() + tref[1:] + tref = tref.title() except IndexError: pass @@ -4903,6 +4904,8 @@ def _build_index_maps(self): tree_titles = tree.title_dict(lang) self._index_title_maps[lang][tree.key] = list(tree_titles.keys()) self._title_node_maps[lang].update(tree_titles) + this_node = list(self._title_node_maps[lang].values())[-1] + self._title_node_maps[lang].update({tree.index.title.title(): this_node}) # Add 'Song Of Songs' except IndexSchemaError as e: logger.error("Error in generating title node dictionary: {}".format(e)) From 2ed4f210c5fe5157143091880b2d15a79ef43f57 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Mon, 30 Oct 2023 10:42:41 +0200 Subject: [PATCH 03/34] Revert "chore: just starting" This reverts commit 5b89b5fd4e23686088b7d1708dc8b3014304744b. --- sefaria/model/text.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index f442da2336..d8484b59b7 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -2720,8 +2720,7 @@ def __clean_tref(tref, lang): try: # capitalize first letter (don't title case all to avoid e.g., "Song Of Songs") - # tref = tref[0].upper() + tref[1:] - tref = tref.title() + tref = tref[0].upper() + tref[1:] except IndexError: pass @@ -4904,8 +4903,6 @@ def _build_index_maps(self): tree_titles = tree.title_dict(lang) self._index_title_maps[lang][tree.key] = list(tree_titles.keys()) self._title_node_maps[lang].update(tree_titles) - this_node = list(self._title_node_maps[lang].values())[-1] - self._title_node_maps[lang].update({tree.index.title.title(): this_node}) # Add 'Song Of Songs' except IndexSchemaError as e: logger.error("Error in generating title node dictionary: {}".format(e)) From c24e8e0738b1565dbd2e67998aeb941b8802aee7 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Mon, 30 Oct 2023 12:24:51 +0200 Subject: [PATCH 04/34] chore: capitalize non stop words in Ref --- sefaria/model/text.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index d8484b59b7..adac91e16c 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -2717,15 +2717,27 @@ def __clean_tref(tref, lang): return tref tref = tref.replace(":", ".") - - try: - # capitalize first letter (don't title case all to avoid e.g., "Song Of Songs") - tref = tref[0].upper() + tref[1:] - except IndexError: - pass - + tref = Ref.__capitalize_non_stop_words(tref) return tref + @staticmethod + def __capitalize_non_stop_words(tref): + stop_words = ['a', 'haRashba', 'in', 'leYom', 'as', 'min', 'footnotes', 'to', "d'Garmei", 'ben', 'di', 'on', + 'he', 'of', 'part', 'wont', 'haHalakhah', 'or', 'shel', 'by', "la'Nefesh", 'ibn', 'leCheker', + 'according', 'the', 'within', 'haLevanon', 'leYaakov', 'and', 'when', "d'Rav", 'al', 'uMeshiv', + 'with', 'haShamayim', 'who', 'into', 'their', 'is', 'veHaDeot', 'debei', 'other', 'his', 'from', + 'for', 'him'] # currently lowercased words in the titles of books + tref = tref.lower() + temp_tref = "" + prev_w = "" + for i, w in enumerate(tref.split()): + if i == 0 or w not in stop_words or prev_w.endswith(","): + # check previous word ends with comma so that we capitalize 'the' in 'Pesach Haggadah, Magid, The Four Sons 1' + w = w[0].upper() + w[1:] + temp_tref += w + " " + prev_w = w + return temp_tref.strip() + def __reinit_tref(self, new_tref): logger.debug("__reinit_tref from {} to {}".format(self.tref, new_tref)) self.tref = self.__clean_tref(new_tref, self._lang) From 512c91f7b1331c7148693492746e5c35eb1d66ee Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Tue, 31 Oct 2023 12:36:33 +0200 Subject: [PATCH 05/34] Revert "chore: capitalize non stop words in Ref" This reverts commit c24e8e0738b1565dbd2e67998aeb941b8802aee7. --- sefaria/model/text.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index adac91e16c..d8484b59b7 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -2717,26 +2717,14 @@ def __clean_tref(tref, lang): return tref tref = tref.replace(":", ".") - tref = Ref.__capitalize_non_stop_words(tref) - return tref - @staticmethod - def __capitalize_non_stop_words(tref): - stop_words = ['a', 'haRashba', 'in', 'leYom', 'as', 'min', 'footnotes', 'to', "d'Garmei", 'ben', 'di', 'on', - 'he', 'of', 'part', 'wont', 'haHalakhah', 'or', 'shel', 'by', "la'Nefesh", 'ibn', 'leCheker', - 'according', 'the', 'within', 'haLevanon', 'leYaakov', 'and', 'when', "d'Rav", 'al', 'uMeshiv', - 'with', 'haShamayim', 'who', 'into', 'their', 'is', 'veHaDeot', 'debei', 'other', 'his', 'from', - 'for', 'him'] # currently lowercased words in the titles of books - tref = tref.lower() - temp_tref = "" - prev_w = "" - for i, w in enumerate(tref.split()): - if i == 0 or w not in stop_words or prev_w.endswith(","): - # check previous word ends with comma so that we capitalize 'the' in 'Pesach Haggadah, Magid, The Four Sons 1' - w = w[0].upper() + w[1:] - temp_tref += w + " " - prev_w = w - return temp_tref.strip() + try: + # capitalize first letter (don't title case all to avoid e.g., "Song Of Songs") + tref = tref[0].upper() + tref[1:] + except IndexError: + pass + + return tref def __reinit_tref(self, new_tref): logger.debug("__reinit_tref from {} to {}".format(self.tref, new_tref)) From bb9df7d7f15c4bf3ca32357950468845f5dbff81 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Tue, 31 Oct 2023 14:56:21 +0200 Subject: [PATCH 06/34] fix(version language): change actualLanguage when saving a Version that its versionTitle reflects another language. --- sefaria/model/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index d8484b59b7..581b7eee93 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1302,7 +1302,7 @@ def _validate(self): """ languageCodeRe = re.search(r"\[([a-z]{2})\]$", getattr(self, "versionTitle", None)) if languageCodeRe and languageCodeRe.group(1) != getattr(self,"actualLanguage",None): - raise InputError("Version actualLanguage does not match bracketed language") + self.actualLanguage = languageCodeRe.group(1) if getattr(self,"language", None) not in ["en", "he"]: raise InputError("Version language must be either 'en' or 'he'") index = self.get_index() From 37dc55fd37d3a2e8c416d7784ce4e05219e45356 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Wed, 1 Nov 2023 10:48:21 +0200 Subject: [PATCH 07/34] fix(Search): case variant repair function Instead of using repairCaseVariant, which only changes "bereshit rabbah 3:15" into "Bereshit rabbah 3:15", I wrote a new function titleCaseExceptStopWords that will capitalize "Rabbah" as well. Also continues to work for "Pirkei avot 4" and "guide for the perplexed" which already work on the site --- static/js/Header.jsx | 2 +- static/js/sefaria/sefaria.js | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/static/js/Header.jsx b/static/js/Header.jsx index e2b87d861a..3cc4309e50 100644 --- a/static/js/Header.jsx +++ b/static/js/Header.jsx @@ -307,7 +307,7 @@ class SearchBar extends Component { .then(d => { // If the query isn't recognized as a ref, but only for reasons of capitalization. Resubmit with recognizable caps. if (Sefaria.isACaseVariant(query, d)) { - this.submitSearch(Sefaria.repairCaseVariant(query, d)); + this.submitSearch(Sefaria.titleCaseExceptStopWords(query, d)); return; } diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 196dea6430..71d92dbfd1 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -1944,9 +1944,27 @@ _media: {}, data["completions"][0] != query.slice(0, data["completions"][0].length)) }, repairCaseVariant: function(query, data) { - // Used when isACaseVariant() is true to prepare the alternative + // Used by getCaseVariants() to prepare the alternative return data["completions"][0] + query.slice(data["completions"][0].length); }, + titleCaseExceptStopWords: function(str) { + const stopWords = ["and", "or", "the", "a", "in", "an", "is", "of", "for"]; + let result = []; + if (str[0] === ' ') { + result.push(""); + str = str.trim(); + } + const words = str.split(' '); + for (let i = 0; i < words.length; i++) { + // title case each word except for stop words. + if (stopWords.includes(words[i])) { + result.push(words[i].replace(words[i][0], words[i][0].toLowerCase())); + } else { + result.push(words[i].replace(words[i][0], words[i][0].toUpperCase())); + } + } + return result.join(' '); + }, makeSegments: function(data, withContext, sheets=false) { // Returns a flat list of annotated segment objects, // derived from the walking the text in data From a6ed96d30d68d14b40472fd1c9669fd0dc10c412 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Wed, 1 Nov 2023 10:55:03 +0200 Subject: [PATCH 08/34] chore: restore comment --- static/js/sefaria/sefaria.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 002fbc3dc2..784b8b51ae 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -1944,7 +1944,7 @@ _media: {}, data["completions"][0] != query.slice(0, data["completions"][0].length)) }, repairCaseVariant: function(query, data) { - // Used by isACaseVariant() to prepare the alternative + // Used when isACaseVariant() is true to prepare the alternative return data["completions"][0] + query.slice(data["completions"][0].length); }, titleCaseExceptStopWords: function(str) { From 222525f84eeef8e2d2ac74b7f923a330a5b44cf2 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 2 Nov 2023 09:31:14 +0200 Subject: [PATCH 09/34] =?UTF-8?q?feat(ref):=20remove=20stop=5Fparsing=20fo?= =?UTF-8?q?r=20identifying=20hebrew=20refs=20of=20talmud=20with=20line=20(?= =?UTF-8?q?e.g.=20=D7=A1=D7=95=D7=98=D7=94=20=D7=9C=D7=94=20=D7=90:=D7=99?= =?UTF-8?q?=D7=90).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sefaria/model/schema.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index 305dda01b8..efd17d834b 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -2351,11 +2351,6 @@ def _core_regex(self, lang, group_id=None, **kwargs): return reg - def stop_parsing(self, lang): - if lang == "he": - return True - return False - def toNumber(self, lang, s, **kwargs): amud_b_list = ['b', 'B', 'ᵇ'] if lang == "en": From 926b17f4d28cedfd1eb8ccd7f85a0abd7a56bb63 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 2 Nov 2023 09:33:16 +0200 Subject: [PATCH 10/34] doc(ref): changing doc of stop_parsing to reflect the previous change. --- sefaria/model/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index efd17d834b..b93a8bf84a 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -2075,7 +2075,7 @@ def hebrew_number_regex(): def stop_parsing(self, lang): """ If this is true, the regular expression will stop parsing at this address level for this language. - It is currently checked for only in the first address position, and is used for Hebrew Talmud addresses. + It is currently checked for only in the first address position, and is used for Hebrew Folio addresses (since Folios are just in alt_structs maybe it hos no effect)_. :param lang: "en" or "he" :return bool: """ From 92ecba8f58acd8c4ae79dd2683593bef5770bb9e Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 2 Nov 2023 10:27:40 +0200 Subject: [PATCH 11/34] fix(timeperiod): replace hyphen by dash in period strings. --- sefaria/model/timeperiod.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sefaria/model/timeperiod.py b/sefaria/model/timeperiod.py index bb0da09560..48db602435 100644 --- a/sefaria/model/timeperiod.py +++ b/sefaria/model/timeperiod.py @@ -144,7 +144,7 @@ def period_string(self, lang): if lang == "en": if getattr(self, "symbol", "") == "CO" or getattr(self, "end", None) is None: - name += " ({}{} {} - )".format( + name += " ({}{} {} – )".format( approxMarker[0], abs(int(self.start)), labels[1]) @@ -155,7 +155,7 @@ def period_string(self, lang): abs(int(self.start)), labels[1]) else: - name += " ({}{} {} - {}{} {})".format( + name += " ({}{} {} – {}{} {})".format( approxMarker[0], abs(int(self.start)), labels[0], @@ -164,7 +164,7 @@ def period_string(self, lang): labels[1]) if lang == "he": if getattr(self, "symbol", "") == "CO" or getattr(self, "end", None) is None: - name += " ({} {} {} - )".format( + name += " ({} {} {} – )".format( abs(int(self.start)), labels[1], approxMarker[0]) @@ -177,7 +177,7 @@ def period_string(self, lang): else: both_approx = approxMarker[0] and approxMarker[1] if both_approx: - name += " ({}{} - {}{} {})".format( + name += " ({}{} – {}{} {})".format( abs(int(self.start)), " " + labels[0] if labels[0] else "", abs(int(self.end)), @@ -185,7 +185,7 @@ def period_string(self, lang): approxMarker[1] ) else: - name += " ({}{}{} - {}{}{})".format( + name += " ({}{}{} – {}{}{})".format( abs(int(self.start)), " " + labels[0] if labels[0] else "", " " + approxMarker[0] if approxMarker[0] else "", From d34dd1bf6db31811d32c672bfbf6270e17937bed Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 2 Nov 2023 12:12:38 +0200 Subject: [PATCH 12/34] feat(timeperiod): new class for person timeperiod which has different method for period string. --- sefaria/model/timeperiod.py | 59 ++++++++++++++++++++++++++++++++----- sefaria/model/topic.py | 17 +++++++---- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/sefaria/model/timeperiod.py b/sefaria/model/timeperiod.py index 48db602435..30ff0a1df9 100644 --- a/sefaria/model/timeperiod.py +++ b/sefaria/model/timeperiod.py @@ -56,6 +56,8 @@ +---------------+------------+-----------------+-------------------------------+-----------------------+ """ +DASH = '–' + class TimePeriod(abst.AbstractMongoRecord): """ TimePeriod is used both for the saved time periods - Eras and Generations @@ -144,10 +146,11 @@ def period_string(self, lang): if lang == "en": if getattr(self, "symbol", "") == "CO" or getattr(self, "end", None) is None: - name += " ({}{} {} – )".format( + name += " ({}{} {} {} )".format( approxMarker[0], abs(int(self.start)), - labels[1]) + labels[1], + DASH) return name elif int(self.start) == int(self.end): name += " ({}{} {})".format( @@ -155,19 +158,21 @@ def period_string(self, lang): abs(int(self.start)), labels[1]) else: - name += " ({}{} {} – {}{} {})".format( + name += " ({}{} {} {} {}{} {})".format( approxMarker[0], abs(int(self.start)), labels[0], + DASH, approxMarker[1], abs(int(self.end)), labels[1]) if lang == "he": if getattr(self, "symbol", "") == "CO" or getattr(self, "end", None) is None: - name += " ({} {} {} – )".format( + name += " ({} {} {} {} )".format( abs(int(self.start)), labels[1], - approxMarker[0]) + approxMarker[0], + DASH) return name elif int(self.start) == int(self.end): name += " ({}{}{})".format( @@ -177,18 +182,20 @@ def period_string(self, lang): else: both_approx = approxMarker[0] and approxMarker[1] if both_approx: - name += " ({}{} – {}{} {})".format( + name += " ({}{} {} {}{} {})".format( abs(int(self.start)), " " + labels[0] if labels[0] else "", + DASH, abs(int(self.end)), " " + labels[1] if labels[1] else "", approxMarker[1] ) else: - name += " ({}{}{} – {}{}{})".format( + name += " ({}{}{} { {}{}{})".format( abs(int(self.start)), " " + labels[0] if labels[0] else "", " " + approxMarker[0] if approxMarker[0] else "", + DASH, abs(int(self.end)), " " + labels[1] if labels[1] else "", " " + approxMarker[1] if approxMarker[1] else "" @@ -234,3 +241,41 @@ def get_generations(include_doubles = False): arg = {"$in": ["Generation", "Two Generations"]} if include_doubles else "Generation" return TimePeriodSet._get_typed_set(arg) +class PersonTimePeriod(TimePeriod): + + def period_string(self, lang): + + if getattr(self, "start", None) == None and getattr(self, "end", None) == None: + return + + labels = self.getYearLabels(lang) + approxMarker = self.getApproximateMarkers(lang) + abs_birth = abs(int(getattr(self, "start", 0))) + abs_death = abs(int(getattr(self, "end", 0))) + if lang == "en": + birth = 'b.' + death = 'd.' + order_vars_by_lang = lambda year, label, approx: (approx, '', year, label) + else: + birth = 'נו׳' + death = 'נפ׳' + order_vars_by_lang = lambda year, label, approx: (year, ' ', label, approx) + + if getattr(self, "symbol", "") == "CO" or getattr(self, "end", None) is None: + name = '{} {}{}{} {}'.format(birth, *order_vars_by_lang(abs_birth, labels[1], approxMarker[0])) + elif getattr(self, "start", None) is None: + name = '{} {}{}{} {}'.format(death, *order_vars_by_lang(abs_death, labels[1], approxMarker[0])) + elif int(self.start) == int(self.end): + name = '{}{}{} {}'.format(*order_vars_by_lang(abs_birth, labels[1], approxMarker[0])) + else: + both_approx = approxMarker[0] and approxMarker[1] + if lang == 'he' and both_approx: + birth_string = '{}{}{}'.format(*order_vars_by_lang(abs_birth, labels[0], approxMarker[0])[:-1]) + else: + birth_string = '{}{}{} {}'.format(*order_vars_by_lang(abs_birth, labels[0], approxMarker[0])) + death_string = '{}{}{} {}'.format(*order_vars_by_lang(abs_death, labels[1], approxMarker[0])) + name = f'{birth_string} {DASH} {death_string}' + + name = f' ({" ".join(name.split())})' + return name + diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index b259b88311..8b0f6ca283 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -4,7 +4,7 @@ from .text import Ref, IndexSet, AbstractTextRecord from .category import Category from sefaria.system.exceptions import InputError, DuplicateRecordError -from sefaria.model.timeperiod import TimePeriod +from sefaria.model.timeperiod import PersonTimePeriod from sefaria.system.database import db import structlog, bleach from sefaria.model.place import Place @@ -438,23 +438,28 @@ def contents(self, **kwargs): # A person may have an era, a generation, or a specific birth and death years, which each may be approximate. # They may also have none of these... - def most_accurate_time_period(self) -> Optional[TimePeriod]: + def most_accurate_time_period(self) -> Optional[PersonTimePeriod]: if self.get_property("birthYear") and self.get_property("deathYear"): - return TimePeriod({ + return PersonTimePeriod({ "start": self.get_property("birthYear"), "startIsApprox": self.get_property("birthYearIsApprox", False), "end": self.get_property("deathYear"), "endIsApprox": self.get_property("deathYearIsApprox", False) }) elif self.get_property("birthYear") and self.get_property("era", "CO"): - return TimePeriod({ + return PersonTimePeriod({ "start": self.get_property("birthYear"), "startIsApprox": self.get_property("birthYearIsApprox", False), }) + elif self.get_property("deathYear"): + return PersonTimePeriod({ + "end": self.get_property("deathYear"), + "endIsApprox": self.get_property("deathYearIsApprox", False) + }) elif self.get_property("generation"): - return TimePeriod().load({"symbol": self.get_property("generation")}) + return PersonTimePeriod().load({"symbol": self.get_property("generation")}) elif self.get_property("era"): - return TimePeriod().load({"symbol": self.get_property("era")}) + return PersonTimePeriod().load({"symbol": self.get_property("era")}) else: return None From 81ac893efc79c51b65b91fc524f52ec158308e3f Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 2 Nov 2023 12:12:57 +0200 Subject: [PATCH 13/34] test(timeperiod): change test to feat changes. --- sefaria/model/tests/text_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 00192188e2..97565d2b9d 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -169,9 +169,9 @@ def test_invalid_index_save_no_category(): def test_best_time_period(): i = model.library.get_index("Rashi on Genesis") - assert i.best_time_period().period_string('en') == ' (c.1075 - c.1105 CE)' + assert i.best_time_period().period_string('en') == ' (c.1075 – c.1105 CE)' i.compDate = None - assert i.best_time_period().period_string('en') == ' (1040 - 1105 CE)' # now that compDate is None, period_string should return Rashi's birth to death years + assert i.best_time_period().period_string('en') == ' (1040 – 1105 CE)' # now that compDate is None, period_string should return Rashi's birth to death years def test_invalid_index_save_no_hebrew_collective_title(): title = 'Bartenura (The Next Generation)' From 89cd5bb9d39b7c1b9f7a82aa07336f7424b9acc3 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 2 Nov 2023 13:17:28 +0200 Subject: [PATCH 14/34] refactor(timeperiod): change PersionTimePeriod to LifePeriod, new function for getting TimePeriod from person topic, and use it when getting it for index comp date. --- sefaria/model/tests/text_test.py | 2 +- sefaria/model/timeperiod.py | 2 +- sefaria/model/topic.py | 21 +++++++++++++-------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 97565d2b9d..6609c17354 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -171,7 +171,7 @@ def test_best_time_period(): i = model.library.get_index("Rashi on Genesis") assert i.best_time_period().period_string('en') == ' (c.1075 – c.1105 CE)' i.compDate = None - assert i.best_time_period().period_string('en') == ' (1040 – 1105 CE)' # now that compDate is None, period_string should return Rashi's birth to death years + assert i.best_time_period().period_string('en') == ' (1040 – 1105 CE)' # now that compDate is None, period_string should return Rashi's birth to death years def test_invalid_index_save_no_hebrew_collective_title(): title = 'Bartenura (The Next Generation)' diff --git a/sefaria/model/timeperiod.py b/sefaria/model/timeperiod.py index 30ff0a1df9..43c8836f54 100644 --- a/sefaria/model/timeperiod.py +++ b/sefaria/model/timeperiod.py @@ -241,7 +241,7 @@ def get_generations(include_doubles = False): arg = {"$in": ["Generation", "Two Generations"]} if include_doubles else "Generation" return TimePeriodSet._get_typed_set(arg) -class PersonTimePeriod(TimePeriod): +class LifePeriod(TimePeriod): def period_string(self, lang): diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 8b0f6ca283..fe207945f5 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -4,7 +4,7 @@ from .text import Ref, IndexSet, AbstractTextRecord from .category import Category from sefaria.system.exceptions import InputError, DuplicateRecordError -from sefaria.model.timeperiod import PersonTimePeriod +from sefaria.model.timeperiod import TimePeriod, LifePeriod from sefaria.system.database import db import structlog, bleach from sefaria.model.place import Place @@ -422,7 +422,7 @@ def contents(self, **kwargs): d = super(PersonTopic, self).contents(**kwargs) if annotate_time_period: d = self.annotate_place(d) - tp = self.most_accurate_time_period() + tp = self.most_accurate_life_period() if tp is not None: d['timePeriod'] = { "name": { @@ -438,31 +438,36 @@ def contents(self, **kwargs): # A person may have an era, a generation, or a specific birth and death years, which each may be approximate. # They may also have none of these... - def most_accurate_time_period(self) -> Optional[PersonTimePeriod]: + def _most_accurate_period(self, obj) -> Optional[LifePeriod]: if self.get_property("birthYear") and self.get_property("deathYear"): - return PersonTimePeriod({ + return obj({ "start": self.get_property("birthYear"), "startIsApprox": self.get_property("birthYearIsApprox", False), "end": self.get_property("deathYear"), "endIsApprox": self.get_property("deathYearIsApprox", False) }) elif self.get_property("birthYear") and self.get_property("era", "CO"): - return PersonTimePeriod({ + return obj({ "start": self.get_property("birthYear"), "startIsApprox": self.get_property("birthYearIsApprox", False), }) elif self.get_property("deathYear"): - return PersonTimePeriod({ + return obj({ "end": self.get_property("deathYear"), "endIsApprox": self.get_property("deathYearIsApprox", False) }) elif self.get_property("generation"): - return PersonTimePeriod().load({"symbol": self.get_property("generation")}) + return obj().load({"symbol": self.get_property("generation")}) elif self.get_property("era"): - return PersonTimePeriod().load({"symbol": self.get_property("era")}) + return obj().load({"symbol": self.get_property("era")}) else: return None + def most_accurate_time_period(self): + return self._most_accurate_period(TimePeriod) + + def most_accurate_life_period(self): + return self._most_accurate_period(LifePeriod) class AuthorTopic(PersonTopic): """ From e1f76add52b139eaec38745da4bd6afcbc30c713 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Sun, 5 Nov 2023 10:01:04 +0200 Subject: [PATCH 15/34] refactor(timeperiod): replace name obj by time_period_class and add typing. --- sefaria/model/topic.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index fe207945f5..4b1e4f028f 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -9,6 +9,7 @@ import structlog, bleach from sefaria.model.place import Place import regex as re +from typing import Type logger = structlog.get_logger(__name__) @@ -438,28 +439,28 @@ def contents(self, **kwargs): # A person may have an era, a generation, or a specific birth and death years, which each may be approximate. # They may also have none of these... - def _most_accurate_period(self, obj) -> Optional[LifePeriod]: + def _most_accurate_period(self, time_period_class: Type[TimePeriod]) -> Optional[LifePeriod]: if self.get_property("birthYear") and self.get_property("deathYear"): - return obj({ + return time_period_class({ "start": self.get_property("birthYear"), "startIsApprox": self.get_property("birthYearIsApprox", False), "end": self.get_property("deathYear"), "endIsApprox": self.get_property("deathYearIsApprox", False) }) elif self.get_property("birthYear") and self.get_property("era", "CO"): - return obj({ + return time_period_class({ "start": self.get_property("birthYear"), "startIsApprox": self.get_property("birthYearIsApprox", False), }) elif self.get_property("deathYear"): - return obj({ + return time_period_class({ "end": self.get_property("deathYear"), "endIsApprox": self.get_property("deathYearIsApprox", False) }) elif self.get_property("generation"): - return obj().load({"symbol": self.get_property("generation")}) + return time_period_class().load({"symbol": self.get_property("generation")}) elif self.get_property("era"): - return obj().load({"symbol": self.get_property("era")}) + return time_period_class().load({"symbol": self.get_property("era")}) else: return None From 92d00bf55d17d393c4153edb93167016858e2079 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Sun, 5 Nov 2023 10:04:21 +0200 Subject: [PATCH 16/34] docs(timeperiod): doc strings for most_accurate_time_period and most_accurate_life_period. --- sefaria/model/topic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 4b1e4f028f..a6f9d7c549 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -465,9 +465,15 @@ def _most_accurate_period(self, time_period_class: Type[TimePeriod]) -> Optional return None def most_accurate_time_period(self): + ''' + :return: most accurate period as TimePeriod (used when a person's LifePeriod is using as a a general TimePeriod) + ''' return self._most_accurate_period(TimePeriod) def most_accurate_life_period(self): + ''' + :return: most accurate period as LifePeriod. currently the difference from TimePeriod is only the string + ''' return self._most_accurate_period(LifePeriod) class AuthorTopic(PersonTopic): From 16bdd240f2749b4d133edf69b9fab365c0b11b2a Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Sun, 5 Nov 2023 10:14:52 +0200 Subject: [PATCH 17/34] test(ref): test for hebrew talmud ref with line. --- sefaria/model/tests/he_ref_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sefaria/model/tests/he_ref_test.py b/sefaria/model/tests/he_ref_test.py index 55fa32975e..b5c615b9ed 100644 --- a/sefaria/model/tests/he_ref_test.py +++ b/sefaria/model/tests/he_ref_test.py @@ -100,6 +100,10 @@ def test_talmud(self): assert r.sections[0] == 58 assert len(r.sections) == 1 + r = m.Ref("סוטה לה א:יא") + assert r.book == 'Sotah' + assert r.sections == [69, 11] + def test_length_catching(self): with pytest.raises(InputError): r = m.Ref('דברים שם') From 7ada104eebf0cc65ccea91f512d99c26d172896e Mon Sep 17 00:00:00 2001 From: Noah Santacruz Date: Sun, 5 Nov 2023 10:20:50 +0200 Subject: [PATCH 18/34] docs(timeperiod): clarify docs. fix typos. --- sefaria/model/topic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index a6f9d7c549..30e92c29b1 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -466,7 +466,7 @@ def _most_accurate_period(self, time_period_class: Type[TimePeriod]) -> Optional def most_accurate_time_period(self): ''' - :return: most accurate period as TimePeriod (used when a person's LifePeriod is using as a a general TimePeriod) + :return: most accurate period as TimePeriod (used when a person's LifePeriod should be formatted like a general TimePeriod) ''' return self._most_accurate_period(TimePeriod) From 3a00bf91f032128b3ce1d3e279e2641526cde822 Mon Sep 17 00:00:00 2001 From: Noah Santacruz Date: Sun, 5 Nov 2023 10:20:59 +0200 Subject: [PATCH 19/34] docs(timeperiod): clarify docs. fix typos. --- sefaria/model/topic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 30e92c29b1..37ffcae5b7 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -472,7 +472,7 @@ def most_accurate_time_period(self): def most_accurate_life_period(self): ''' - :return: most accurate period as LifePeriod. currently the difference from TimePeriod is only the string + :return: most accurate period as LifePeriod. currently the only difference from TimePeriod is the way the time period is formatted as a string. ''' return self._most_accurate_period(LifePeriod) From 356765366b0be31d82493138ef709c220912c30d Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Sun, 5 Nov 2023 15:21:34 +0200 Subject: [PATCH 20/34] refactor(ref): remove stop_parsing since it has no use. --- sefaria/model/schema.py | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index b93a8bf84a..fe9537c7d3 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -1073,12 +1073,11 @@ def full_regex(self, title, lang, anchored=True, compiled=True, capture_title=Fa def address_regex(self, lang, **kwargs): group = "a0" reg = self._addressTypes[0].regex(lang, group, **kwargs) - if not self._addressTypes[0].stop_parsing(lang): - for i in range(1, self.depth): - group = "a{}".format(i) - reg += "(" + self.after_address_delimiter_ref + self._addressTypes[i].regex(lang, group, **kwargs) + ")" - if not kwargs.get("strict", False): - reg += "?" + for i in range(1, self.depth): + group = "a{}".format(i) + reg += "(" + self.after_address_delimiter_ref + self._addressTypes[i].regex(lang, group, **kwargs) + ")" + if not kwargs.get("strict", False): + reg += "?" if kwargs.get("match_range"): # TODO there is a potential error with this regex. it fills in toSections starting from highest depth and going to lowest. @@ -1089,14 +1088,13 @@ def address_regex(self, lang, **kwargs): reg += r"(?=\S)" # must be followed by something (Lookahead) group = "ar0" reg += self._addressTypes[0].regex(lang, group, **kwargs) - if not self._addressTypes[0].stop_parsing(lang): - reg += "?" - for i in range(1, self.depth): - reg += r"(?:(?:" + self.after_address_delimiter_ref + r")?" - group = "ar{}".format(i) - reg += "(" + self._addressTypes[i].regex(lang, group, **kwargs) + ")" - # assuming strict isn't relevant on ranges # if not kwargs.get("strict", False): - reg += ")?" + reg += "?" + for i in range(1, self.depth): + reg += r"(?:(?:" + self.after_address_delimiter_ref + r")?" + group = "ar{}".format(i) + reg += "(" + self._addressTypes[i].regex(lang, group, **kwargs) + ")" + # assuming strict isn't relevant on ranges # if not kwargs.get("strict", False): + reg += ")?" reg += r")?" # end range clause return reg @@ -2072,15 +2070,6 @@ def hebrew_number_regex(): [\u05d0-\u05d8]? # One or zero alef-tet (1-9) )""" - def stop_parsing(self, lang): - """ - If this is true, the regular expression will stop parsing at this address level for this language. - It is currently checked for only in the first address position, and is used for Hebrew Folio addresses (since Folios are just in alt_structs maybe it hos no effect)_. - :param lang: "en" or "he" - :return bool: - """ - return False - def toNumber(self, lang, s): """ Return the numerical form of s in this address scheme @@ -2485,11 +2474,6 @@ def _core_regex(self, lang, group_id=None, **kwargs): return reg - def stop_parsing(self, lang): - if lang == "he": - return True - return False - def toNumber(self, lang, s, **kwargs): if lang == "en": try: From 141106c059ed85cd8633c57fd6e0f456e4fdf279 Mon Sep 17 00:00:00 2001 From: Lev Israel Date: Sun, 5 Nov 2023 17:30:14 +0200 Subject: [PATCH 21/34] chore: Update VersionState Docs --- sefaria/model/version_state.py | 81 +++++++++++----------------------- 1 file changed, 25 insertions(+), 56 deletions(-) diff --git a/sefaria/model/version_state.py b/sefaria/model/version_state.py index 62aea25423..8b1b4a95df 100644 --- a/sefaria/model/version_state.py +++ b/sefaria/model/version_state.py @@ -20,73 +20,42 @@ except ImportError: USE_VARNISH = False ''' -old count docs were: - c["allVersionCounts"] - c["availableTexts"] = { - "en": - "he": - } - - c["availableCounts"] = { # - "en": - "he": - } - - c["percentAvailable"] = { - "he": - "en": - } - - c["textComplete"] = { - "he": - "en" - } - - c['estimatedCompleteness'] = { - "he": { - 'estimatedPercent': - 'availableSegmentCount': # is availableCounts[-1] - 'percentAvailableInvalid': - 'percentAvailable': # duplicate - 'isSparse': - } - "en": - } - - -and now self.content is: - { - "_en": { - "availableTexts": - "availableCounts": - "percentAvailable": - "textComplete": - 'completenessPercent': - 'percentAvailableInvalid': - 'sparseness': # was isSparse - } - "_he": ... - "_all" { - "availableTexts": - "shape": - For depth 1: Integer - length - For depth 2: List of chapter lengths - For depth 3: List of list of chapter lengths? - } - } - ''' class VersionState(abst.AbstractMongoRecord, AbstractSchemaContent): """ This model overrides default init/load/save behavior, since there is one and only one VersionState record for each Index record. + + The `content` attribute is a dictionary which is the root of a tree, mirroring the shape of a Version, where the leaf nodes of the tree are dictionaries with a shape like the following:: + { + "_en": { + "availableTexts": + "availableCounts": + "percentAvailable": + "textComplete": + 'completenessPercent': + 'percentAvailableInvalid': + 'sparseness': # was isSparse + } + "_he": ... + "_all" { + "availableTexts": + "shape": + For depth 1: Integer - length + For depth 2: List of chapter lengths + For depth 3: List of list of chapter lengths? + } + } + + For example, the `content` attribute for `Pesach Haggadah` will be a dictionary with keys: "Kadesh", "Urchatz", "Karpas" ... each with a value of a dictionary like the above. They key "Magid" has a value of dictionary with keys "Ha Lachma Anya", etc. + """ collection = 'vstate' required_attrs = [ "title", # Index title - "content" # tree of data about nodes + "content" # tree of data about nodes. See above. ] optional_attrs = [ "flags", From 4ed4179fa8b01f0251c2d3dc5c293c2601a81a14 Mon Sep 17 00:00:00 2001 From: Ephraim Date: Mon, 6 Nov 2023 09:13:33 +0200 Subject: [PATCH 22/34] refactor(js-general): Remove unused React dependencies from sefaria.js This is done to ensure that sefaria.js uses vanilla js and does not become dependent on React --- static/js/sefaria/sefaria.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 28aeffcae1..ff019ec44c 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -9,8 +9,6 @@ import Track from './track'; import Hebrew from './hebrew'; import Util from './util'; import $ from './sefariaJquery'; -import {useContext} from "react"; -import {ContentLanguageContext} from "../context"; let Sefaria = Sefaria || { From d07f6658ff5ef3f7c6431c8c18a47e86027f19a0 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 6 Nov 2023 10:40:28 +0200 Subject: [PATCH 23/34] fix(timePeriod): typo. --- sefaria/model/timeperiod.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/timeperiod.py b/sefaria/model/timeperiod.py index 43c8836f54..5e94b33fc8 100644 --- a/sefaria/model/timeperiod.py +++ b/sefaria/model/timeperiod.py @@ -191,7 +191,7 @@ def period_string(self, lang): approxMarker[1] ) else: - name += " ({}{}{} { {}{}{})".format( + name += " ({}{}{} {} {}{}{})".format( abs(int(self.start)), " " + labels[0] if labels[0] else "", " " + approxMarker[0] if approxMarker[0] else "", From ec31bcbccca1ad37da00dd9fa9369114d1b1fd63 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 6 Nov 2023 11:10:29 +0200 Subject: [PATCH 24/34] feat(timePeriod): add determine_year_estimate to TimePeriod. --- sefaria/model/timeperiod.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sefaria/model/timeperiod.py b/sefaria/model/timeperiod.py index 5e94b33fc8..33bd3d389a 100644 --- a/sefaria/model/timeperiod.py +++ b/sefaria/model/timeperiod.py @@ -225,6 +225,16 @@ def get_people_in_generation(self, include_doubles = True): else: return topic.Topic({"properties.generation.value": self.symbol}) + def determine_year_estimate(self): + start = getattr(self, 'start', None) + end = getattr(self, 'end', None) + if start != None and end != None: + return round((int(start) + int(end)) / 2) + elif start != None: + return int(start) + elif end != None: + return int(end) + class TimePeriodSet(abst.AbstractMongoSet): recordClass = TimePeriod From 585b6a056a90c7a4b2de24dfe74caf5fd5e7e8de Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 6 Nov 2023 11:12:07 +0200 Subject: [PATCH 25/34] fix(elasticSearch): use TimePeriod's determine_year_estimate function rather than the start attribute. --- sefaria/pagesheetrank.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sefaria/pagesheetrank.py b/sefaria/pagesheetrank.py index 1c2087ed2b..603de805bc 100644 --- a/sefaria/pagesheetrank.py +++ b/sefaria/pagesheetrank.py @@ -212,8 +212,8 @@ def put_link_in_graph(ref1, ref2, weight=1.0): refs = [Ref(r) for r in link.refs] tp1 = refs[0].index.best_time_period() tp2 = refs[1].index.best_time_period() - start1 = int(tp1.start) if tp1 else 3000 - start2 = int(tp2.start) if tp2 else 3000 + start1 = int(tp1.determine_year_estimate()) if tp1 else 3000 + start2 = int(tp2.determine_year_estimate()) if tp2 else 3000 older_ref, newer_ref = (refs[0], refs[1]) if start1 < start2 else (refs[1], refs[0]) From f6807bec631286a75fe89caa02cfcc18654790d3 Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Mon, 6 Nov 2023 13:36:17 +0200 Subject: [PATCH 26/34] helm(fix): increase backup pod storage size --- helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml b/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml index ba6f6f37d7..0a0ea29f2e 100644 --- a/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml @@ -58,6 +58,8 @@ spec: command: ["bash"] args: ["-c", "/scripts/create-dumps.sh"] resources: + requests: + ephemeral-storage: 20Gi limits: memory: "500Mi" containers: From d980b850e86b65ed97dbda168777a451cd4ada5a Mon Sep 17 00:00:00 2001 From: Lev Israel Date: Mon, 6 Nov 2023 15:09:45 +0200 Subject: [PATCH 27/34] chore: Improve VersionState docs. Note potential refactors. --- sefaria/model/version_state.py | 35 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/sefaria/model/version_state.py b/sefaria/model/version_state.py index 8b1b4a95df..d5021cbd04 100644 --- a/sefaria/model/version_state.py +++ b/sefaria/model/version_state.py @@ -30,25 +30,28 @@ class VersionState(abst.AbstractMongoRecord, AbstractSchemaContent): The `content` attribute is a dictionary which is the root of a tree, mirroring the shape of a Version, where the leaf nodes of the tree are dictionaries with a shape like the following:: { "_en": { - "availableTexts": - "availableCounts": - "percentAvailable": - "textComplete": - 'completenessPercent': - 'percentAvailableInvalid': - 'sparseness': # was isSparse + "availableTexts": Mask of what texts are available in this language. Boolean values (0 or 1) in the shape of the JaggedArray + "availableCounts": Array, with length == depth of the node. Each element is the number of available elements at that depth. e.g [chapters, verses] + "percentAvailable": Percent of this text available in this language TODO: Only used on the dashboard. Remove? + 'percentAvailableInvalid': Boolean. Whether the value of "percentAvailable" can be trusted. TODO: Only used on the dashboard. Remove? + "textComplete": Boolean. Whether the text is complete in this language. TODO: Not used outside of this file. Should be removed. + 'completenessPercent': Percent of this text complete in this language TODO: Not used outside of this file. Should be removed. + 'sparseness': Legacy - present on some records, but no longer in code TODO: remove } - "_he": ... + "_he": {...} # same keys as _en "_all" { - "availableTexts": + "availableTexts": Mask what texts are available in this text overall. Boolean values (0 or 1) in the shape of the JaggedArray "shape": - For depth 1: Integer - length - For depth 2: List of chapter lengths - For depth 3: List of list of chapter lengths? + For depth 1: Integer -length + For depth 2: List of section lengths + For depth 3: List of list of section lengths } } - For example, the `content` attribute for `Pesach Haggadah` will be a dictionary with keys: "Kadesh", "Urchatz", "Karpas" ... each with a value of a dictionary like the above. They key "Magid" has a value of dictionary with keys "Ha Lachma Anya", etc. + For example: + - the `content` attribute for a simple text like `Genesis` will be a dictionary with keys "_en", "_he", and "_all", as above. + - the `content` attribute for `Pesach Haggadah` will be a dictionary with keys: "Kadesh", "Urchatz", "Karpas" ... each with a value of a dictionary like the above. + The key "Magid" has a value of dictionary with keys "Ha Lachma Anya", etc. """ collection = 'vstate' @@ -58,9 +61,9 @@ class VersionState(abst.AbstractMongoRecord, AbstractSchemaContent): "content" # tree of data about nodes. See above. ] optional_attrs = [ - "flags", - "linksCount", - "first_section_ref" + "flags", # "heComplete" : Bool, "enComplete" : Bool + "linksCount", # Integer + "first_section_ref" # Normal text Ref ] langs = ["en", "he"] From 1e1779f4471e3cc3aca6447286f689cf9fd1471e Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Tue, 7 Nov 2023 11:11:10 +0200 Subject: [PATCH 28/34] helm(fix): increase ephemeral storage for backup job --- .../sefaria-project/templates/cronjob/mongo-backup.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml b/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml index 0a0ea29f2e..9c4ec49e9f 100644 --- a/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml @@ -59,7 +59,7 @@ spec: args: ["-c", "/scripts/create-dumps.sh"] resources: requests: - ephemeral-storage: 20Gi + ephemeral-storage: 30Gi limits: memory: "500Mi" containers: @@ -100,7 +100,8 @@ spec: name: upload-dumps-{{ .Values.deployEnv }} defaultMode: 0755 - name: shared-volume - emptyDir: {} + emptyDir: + sizeLimit: 30Gi successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 2 {{- end }} From 3ffdac3e37f0e1c0b4c9fe7fa5d685646d59ba12 Mon Sep 17 00:00:00 2001 From: Lev Israel Date: Tue, 7 Nov 2023 11:31:47 +0200 Subject: [PATCH 29/34] chore: Improve version_state docs --- sefaria/model/version_state.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sefaria/model/version_state.py b/sefaria/model/version_state.py index d5021cbd04..2fef65b10e 100644 --- a/sefaria/model/version_state.py +++ b/sefaria/model/version_state.py @@ -27,7 +27,7 @@ class VersionState(abst.AbstractMongoRecord, AbstractSchemaContent): """ This model overrides default init/load/save behavior, since there is one and only one VersionState record for each Index record. - The `content` attribute is a dictionary which is the root of a tree, mirroring the shape of a Version, where the leaf nodes of the tree are dictionaries with a shape like the following:: + The `content` attribute is a dictionary which is the root of a tree, mirroring the shape of a Version, where the leaf nodes of the tree are dictionaries with a shape like the following: { "_en": { "availableTexts": Mask of what texts are available in this language. Boolean values (0 or 1) in the shape of the JaggedArray @@ -51,8 +51,11 @@ class VersionState(abst.AbstractMongoRecord, AbstractSchemaContent): For example: - the `content` attribute for a simple text like `Genesis` will be a dictionary with keys "_en", "_he", and "_all", as above. - the `content` attribute for `Pesach Haggadah` will be a dictionary with keys: "Kadesh", "Urchatz", "Karpas" ... each with a value of a dictionary like the above. - The key "Magid" has a value of dictionary with keys "Ha Lachma Anya", etc. + The key "Magid" has a value of a dictionary, where each key is a different sub-section of Magid. + The value for each key is a dictionary as detailed above, specific to each sub-section. + So for example, one key will be "Ha Lachma Anya" and the value will be a dictionary, like the above, specific to the details of "Ha Lachma Anya". + Every JaggedArrayNode has a corresponding vstate dictionary. So for complex texts, each leaf node (and leaf nodes by definition must be JaggedArrayNodes) has this corresponding dictionary. """ collection = 'vstate' From e7603fe43d46c027f5c835bcf1f7dd7d48573a41 Mon Sep 17 00:00:00 2001 From: Lev Israel Date: Tue, 7 Nov 2023 11:45:03 +0200 Subject: [PATCH 30/34] chore: Remove count model --- sefaria/model/count.py | 93 ------------------------------------------ 1 file changed, 93 deletions(-) delete mode 100644 sefaria/model/count.py diff --git a/sefaria/model/count.py b/sefaria/model/count.py deleted file mode 100644 index 63b31a9552..0000000000 --- a/sefaria/model/count.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -count.py -Writes to MongoDB Collection: counts -""" -import structlog -logger = structlog.get_logger(__name__) - -from . import abstract as abst -import sefaria.datatype.jagged_array as ja -from sefaria.system.exceptions import BookNameError - - -class Count(abst.AbstractMongoRecord): - """ - """ - collection = 'counts' - - required_attrs = [ - "textComplete", - "percentAvailable", - "availableCounts" - ] - optional_attrs = [ - "categories", - "availableTexts", - "title", - "linksCount", - "estimatedCompleteness", - "flags", - "allVersionCounts" - ] - - def _set_derived_attributes(self): - from . import text - - if getattr(self, "title", None): - try: - indx = text.library.get_index(self.title) - attrs = indx.contents() - #del attrs["_id"] - self.index_attr_keys = list(attrs.keys()) - self.__dict__.update(attrs) - except BookNameError as e: - logger.warning("Count object failed to get Index for {} : {} Normal right after Index name change.".format(self.title, e)) - - #todo: this needs to be considered. What happens when the data is modified? etc. - if getattr(self, "allVersionCounts", None) is not None: - self._allVersionCountsJA = ja.JaggedIntArray(self.allVersionCounts) - - #remove uneccesary and dangerous categories attr from text counts - #This assumes that category nodes have no title element - #todo: review this. Do we need to subclass text and category counts? - def _saveable_attr_keys(self): - attrs = super(Count, self)._saveable_attr_keys() - if getattr(self, "title", None): - attrs.remove("categories") - return attrs - - def contents(self, **kwargs): - attrs = super(Count, self).contents() - for key in self.index_attr_keys: - attrs[key] = getattr(self, key, None) - return attrs - - #deprecated - use JA directly - def next_address(self, starting_points=None): - starting_points = starting_points or [] - if len(starting_points) > 0: - starting_points[-1] += 1 - return self._allVersionCountsJA.next_index(starting_points) - - #deprecated - use JA directly - def prev_address(self, starting_points=None): - starting_points = starting_points or [] - if len(starting_points) > 0: - starting_points[-1] -= 1 - return self._allVersionCountsJA.prev_index(starting_points) - - #deprecated - use JA directly - def section_length(self, section_numbers): - """ - :param section_numbers: The list of 1-based (E.g. Chapter 5 is section_number 5) section numbers - :return: The length of that section - """ - return self._allVersionCountsJA.sub_array_length([s - 1 for s in section_numbers]) - - -class CountSet(abst.AbstractMongoSet): - recordClass = Count - - -def process_index_delete_in_counts(indx, **kwargs): - CountSet({"title":indx.title}).delete() From bd0744d128c41ec72ad0c3cea41a75fe0a66ffe2 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Tue, 7 Nov 2023 11:56:30 +0200 Subject: [PATCH 31/34] fix(Search): improve repairCaseVariant to handle multiple words --- static/js/Header.jsx | 2 +- static/js/sefaria/sefaria.js | 32 +++++++++++++------------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/static/js/Header.jsx b/static/js/Header.jsx index ca6066ddb4..5f77a7fc69 100644 --- a/static/js/Header.jsx +++ b/static/js/Header.jsx @@ -307,7 +307,7 @@ class SearchBar extends Component { .then(d => { // If the query isn't recognized as a ref, but only for reasons of capitalization. Resubmit with recognizable caps. if (Sefaria.isACaseVariant(query, d)) { - this.submitSearch(Sefaria.titleCaseExceptStopWords(query, d)); + this.submitSearch(Sefaria.repairCaseVariant(query, d)); return; } const repairedQuery = Sefaria.repairGershayimVariant(query, d); diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 784b8b51ae..360a000681 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -1945,26 +1945,20 @@ _media: {}, }, repairCaseVariant: function(query, data) { // Used when isACaseVariant() is true to prepare the alternative - return data["completions"][0] + query.slice(data["completions"][0].length); - }, - titleCaseExceptStopWords: function(str) { - const stopWords = ["and", "or", "the", "a", "in", "an", "is", "of", "for"]; - let result = []; - if (str[0] === ' ') { - result.push(""); - str = str.trim(); - } - const words = str.split(' '); - for (let i = 0; i < words.length; i++) { - // title case each word except for stop words. - if (stopWords.includes(words[i])) { - result.push(words[i].replace(words[i][0], words[i][0].toLowerCase())); - } else { - result.push(words[i].replace(words[i][0], words[i][0].toUpperCase())); + const completionArray = data["completion_objects"].filter(x => x.type === 'ref').map(x => x.title); + let normalizedQuery = query.toLowerCase(); + let bestMatch = ""; + let bestMatchLength = 0; + + completionArray.forEach((completion) => { + let normalizedCompletion = completion.toLowerCase(); + if (normalizedQuery.includes(normalizedCompletion) && normalizedCompletion.length > bestMatchLength) { + bestMatch = completion; + bestMatchLength = completion.length; } - } - return result.join(' '); - }, + }); + return bestMatch + query.slice(bestMatch.length); + }, repairGershayimVariant: function(query, data) { if (!data["is_ref"] && data.completions && !data.completions.includes(query)) { function normalize_gershayim(string) { From af68e4c4f6fdfbe1debe506656598bb56e2fe18c Mon Sep 17 00:00:00 2001 From: Skyler Cohen Date: Tue, 7 Nov 2023 13:21:58 -0500 Subject: [PATCH 32/34] fix(strapi-cms): Fix allowing banners and modals to be rendered when there isn't a Hebrew localization --- static/js/Misc.jsx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index 7468ab72f4..c0ca0d2c96 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -2220,10 +2220,12 @@ const InterruptingMessage = ({ // Don't show the modal on pages where the button link goes to since you're already there const excludedPaths = ["/donate", "/mobile", "/app", "/ways-to-give"]; if (strapi.modal.buttonURL) { - excludedPaths.push( - new URL(strapi.modal.buttonURL.en).pathname, - new URL(strapi.modal.buttonURL.he).pathname - ); + if (strapi.modal.buttonURL.en) { + excludedPaths.push(new URL(strapi.modal.buttonURL.en).pathname); + } + if (strapi.modal.buttonURL.he) { + excludedPaths.push(new URL(strapi.modal.buttonURL.he).pathname); + } } return excludedPaths.indexOf(window.location.pathname) === -1; }; @@ -2385,10 +2387,12 @@ const Banner = ({ onClose }) => { const excludedPaths = ["/donate", "/mobile", "/app", "/ways-to-give"]; // Don't show the banner on pages where the button link goes to since you're already there if (strapi.banner.buttonURL) { - excludedPaths.push( - new URL(strapi.banner.buttonURL.en).pathname, - new URL(strapi.banner.buttonURL.he).pathname - ); + if (strapi.banner.buttonURL.en) { + excludedPaths.push(new URL(strapi.banner.buttonURL.en).pathname); + } + if (strapi.banner.buttonURL.he) { + excludedPaths.push(new URL(strapi.banner.buttonURL.he).pathname); + } } return excludedPaths.indexOf(window.location.pathname) === -1; }; From 334bc45cd3559f5ac8d781de49a6640653eaca6d Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Wed, 8 Nov 2023 10:38:17 +0200 Subject: [PATCH 33/34] helm(fix): remove misplaced linebreak --- .../templates/configmap/create-mongo-dumps.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/helm-chart/sefaria-project/templates/configmap/create-mongo-dumps.yaml b/helm-chart/sefaria-project/templates/configmap/create-mongo-dumps.yaml index 40f79d01e5..60d126a91b 100644 --- a/helm-chart/sefaria-project/templates/configmap/create-mongo-dumps.yaml +++ b/helm-chart/sefaria-project/templates/configmap/create-mongo-dumps.yaml @@ -67,8 +67,7 @@ data: sleep 2 done - until mongodump --uri="$URI" -v -d $DATABASE --excludeCollection=history --excludeCollection=texts --excludeCollection=sheets --excludeCollection=links - --excludeCollection=django_cache --excludeCollection=user_history -o "${DATADIR}/dump" + until mongodump --uri="$URI" -v -d $DATABASE --excludeCollection=history --excludeCollection=texts --excludeCollection=sheets --excludeCollection=links --excludeCollection=django_cache --excludeCollection=user_history -o "${DATADIR}/dump" do echo "trying to dump other stuff again" sleep 2 From 964bc46510951cbc447f8f4aaad365f1496b977d Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Wed, 8 Nov 2023 10:52:34 +0200 Subject: [PATCH 34/34] helm(fix): remove explicit ephemeral storage limits --- .../sefaria-project/templates/cronjob/mongo-backup.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml b/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml index 9c4ec49e9f..ba6f6f37d7 100644 --- a/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/mongo-backup.yaml @@ -58,8 +58,6 @@ spec: command: ["bash"] args: ["-c", "/scripts/create-dumps.sh"] resources: - requests: - ephemeral-storage: 30Gi limits: memory: "500Mi" containers: @@ -100,8 +98,7 @@ spec: name: upload-dumps-{{ .Values.deployEnv }} defaultMode: 0755 - name: shared-volume - emptyDir: - sizeLimit: 30Gi + emptyDir: {} successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 2 {{- end }}