diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 63cc35e..0a395b6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,19 +2,19 @@ name: CI on: push: - branches: [master] + branches: ['*'] pull_request: - branches: [master] + branches: ['*'] jobs: build: strategy: matrix: platform: [ubuntu-latest, windows-latest] - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: [3.8, 3.9, "3.10", "3.11"] exclude: [ - {platform: windows-latest, python-version: "3.8"}, - {platform: windows-latest, python-version: "3.9"} + {platform: windows-latest, python-version: "3.9"}, + {platform: windows-latest, python-version: "3.10"} ] runs-on: ${{ matrix.platform }} diff --git a/README.md b/README.md index 5b15508..b7fb21a 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ This was extracted out of [my HPI](https://github.com/seanbreckenridge/HPI/tree/ ## Installation -Requires `python3.7+` +Requires `python3.8+` To install with pip, run: @@ -174,7 +174,6 @@ Just to give a brief overview, to add new functionality (parsing some new folder - Add a `model` for it in [`models.py`](google_takeout_parser/models.py) subclassing `BaseEvent` and adding it to the Union at the bottom of the file. That should have a `key` property function which describes each event uniquely (used to merge takeout events) - Write a function which takes the `Path` to the file you're trying to parse and converts it to the model you created (See examples in [`parse_json.py`](google_takeout_parser/parse_json.py)). Ideally extract a single raw item from the takeout file add a test for it so its obvious when/if the format changes. 
-- Set [the `return_type`](https://github.com/seanbreckenridge/google_takeout_parser/blob/7b1ee8ec3c3f36e6f279f20a9a214b6a3e8775f5/google_takeout_parser/parse_json.py#L71) property on the function, to use for caching/filtering - Add a regex match for the file path to the [`DEFAULT_HANDLER_MAP`](https://github.com/seanbreckenridge/google_takeout_parser/blob/2bd64b7373e4a2ac2ace32e03b25ca3b7e901034/google_takeout_parser/path_dispatch.py#L48) ### Testing diff --git a/google_takeout_parser/__init__.py b/google_takeout_parser/__init__.py index 5c05cf6..86adc69 100644 --- a/google_takeout_parser/__init__.py +++ b/google_takeout_parser/__init__.py @@ -1,10 +1,6 @@ -from pkg_resources import get_distribution, DistributionNotFound +import importlib.metadata -try: - # Change here if project is renamed and does not equal the package name - dist_name = __name__ - __version__ = get_distribution(dist_name).version -except DistributionNotFound: - __version__ = "unknown" -finally: - del get_distribution, DistributionNotFound +# Change here if project is renamed and does not equal the package name +__version__ = importlib.metadata.version(__name__) + +del importlib diff --git a/google_takeout_parser/__main__.py b/google_takeout_parser/__main__.py index 1092740..395d545 100644 --- a/google_takeout_parser/__main__.py +++ b/google_takeout_parser/__main__.py @@ -106,9 +106,9 @@ def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None: """ from .path_dispatch import TakeoutParser from .merge import cached_merge_takeouts, merge_events - from .models import DEFAULT_MODEL_TYPE + from .models import DEFAULT_MODEL_TYPE, Res - res: List[DEFAULT_MODEL_TYPE] = [] + res: List[Res[DEFAULT_MODEL_TYPE]] = [] if cache: res = list(cached_merge_takeouts(list(takeout_dir))) else: diff --git a/google_takeout_parser/compat.py b/google_takeout_parser/compat.py deleted file mode 100644 index 2c7eb03..0000000 --- a/google_takeout_parser/compat.py +++ /dev/null @@ -1,8 +0,0 @@ -import sys - -# 
from https://github.com/karlicoss/HPI/blob/master/my/core/compat.py - -if sys.version_info[:2] >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal # noqa: F401 diff --git a/google_takeout_parser/merge.py b/google_takeout_parser/merge.py index 69c0b04..d2f05f6 100644 --- a/google_takeout_parser/merge.py +++ b/google_takeout_parser/merge.py @@ -23,8 +23,8 @@ # Note: only used for this module, HPI caches elsewhere @cachew( - cache_path=lambda _: str(takeout_cache_path / "_merged_takeouts"), - depends_on=lambda pths: list(sorted([str(p) for p in pths])), + cache_path=str(takeout_cache_path / "_merged_takeouts"), + depends_on=lambda tp: str(list(sorted(str(p) for p in tp))), force_file=True, logger=logger, ) diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index e332274..58ddd6d 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -7,7 +7,7 @@ from __future__ import annotations from datetime import datetime -from typing import Optional, List, Tuple, Any, Union, Iterator, TYPE_CHECKING, Dict +from typing import Optional, List, Tuple, Any, Union, Iterator, Dict, Protocol from dataclasses import dataclass from .common import Res @@ -26,14 +26,6 @@ # name, url Subtitles = Tuple[str, MetaData] -if TYPE_CHECKING: - try: - from typing import Protocol - except ImportError: - from typing_extensions import Protocol # type: ignore -else: - Protocol = object - class BaseEvent(Protocol): @property @@ -107,11 +99,11 @@ def key(self) -> int: class Location(BaseEvent): lat: float lng: float - accuracy: Optional[int] + accuracy: Optional[float] dt: datetime @property - def key(self) -> Tuple[float, float, Optional[int], int]: + def key(self) -> Tuple[float, float, Optional[float], int]: return self.lat, self.lng, self.accuracy, int(self.dt.timestamp()) diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py index b2e2116..274cbca 100644 --- 
a/google_takeout_parser/parse_html/activity.py +++ b/google_takeout_parser/parse_html/activity.py @@ -337,6 +337,3 @@ def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]: yield _parse_activity_div(outer_div, file_dt=file_dt) except Exception as ae: yield ae - - -_parse_html_activity.return_type = Activity # type: ignore[attr-defined] diff --git a/google_takeout_parser/parse_html/comment.py b/google_takeout_parser/parse_html/comment.py index 0a591e6..a3e2a28 100644 --- a/google_takeout_parser/parse_html/comment.py +++ b/google_takeout_parser/parse_html/comment.py @@ -60,9 +60,6 @@ def _parse_html_comment_file(p: Path) -> Iterator[Res[YoutubeComment]]: yield e -_parse_html_comment_file.return_type = YoutubeComment # type: ignore[attr-defined] - - def test_parse_html_comment_file() -> None: li_obj = bs4.BeautifulSoup( """""", diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 6f30a03..af011a5 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -70,9 +70,6 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]: yield e -_parse_json_activity.return_type = Activity # type: ignore[attr-defined] - - def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]: json_data = json.loads(p.read_text()) if not isinstance(json_data, list): @@ -91,9 +88,6 @@ def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]: yield e -_parse_likes.return_type = LikedYoutubeVideo # type: ignore[attr-defined] - - def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]: json_data = json.loads(p.read_text()) if not isinstance(json_data, list): @@ -109,9 +103,6 @@ def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]: yield e -_parse_app_installs.return_type = PlayStoreAppInstall # type: ignore[attr-defined] - - def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime: if f"{key}Ms" in d: return parse_datetime_millis(d[f"{key}Ms"]) @@ -137,14 +128,12 
@@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]: lng=float(loc["longitudeE7"]) / 1e7, lat=float(loc["latitudeE7"]) / 1e7, dt=_parse_location_timestamp(loc), - accuracy=None if accuracy is None else int(accuracy), + accuracy=None if accuracy is None else float(accuracy), ) except Exception as e: yield e -_parse_location_history.return_type = Location # type: ignore[attr-defined] - _sem_required_keys = ["location", "duration"] @@ -209,9 +198,6 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]: yield e -_parse_semantic_location_history.return_type = PlaceVisit # type: ignore[attr-defined] - - def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: json_data = json.loads(p.read_text()) if "Browser History" not in json_data: @@ -226,6 +212,3 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: ) except Exception as e: yield e - - -_parse_chrome_history.return_type = ChromeHistory # type: ignore[attr-defined] diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py index a60526b..4da5de5 100644 --- a/google_takeout_parser/path_dispatch.py +++ b/google_takeout_parser/path_dispatch.py @@ -9,13 +9,14 @@ from typing import ( Iterator, Dict, + Union, Callable, Any, Optional, List, Type, Tuple, - cast, + Literal, ) from collections import defaultdict @@ -23,7 +24,6 @@ from cachew import cachew from . import __version__ as _google_takeout_version -from .compat import Literal from .common import Res, PathIsh from .cache import takeout_cache_path from .log import logger @@ -47,22 +47,68 @@ HandlerFunction = Callable[[Path], BaseResults] HandlerMap = Dict[str, Optional[HandlerFunction]] -_CacheKeySingle = Type[BaseEvent] -CacheKey = _CacheKeySingle +CacheKey = Tuple[Type[BaseEvent], ...] 
def _cache_key_to_str(c: CacheKey) -> str: - return str(c.__name__).casefold() + """Convert a cache key to a string""" + return "_".join(sorted(p.__name__ for p in c)).casefold() -def _parse_handler_return_type(handler: HandlerFunction) -> CacheKey: - assert hasattr( - handler, "return_type" - ), f"Handler functions should have an 'return_type' property which specifies what types this produces. See parse_json.py for an example. No 'return_type' on {handler}" - val: Any = getattr(handler, "return_type") - assert isinstance(val, type), f"{val} is not a type" - assert BaseEvent in val.__mro__, f"{val} not a subclass of BaseEvent" - return cast(_CacheKeySingle, val) +def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey: + # Take a function annotated to return Iterator[Union[Item, Exception]] and return the non-Exception types as a tuple, e.g. (Item,) + + import inspect + from cachew.legacy import get_union_args + + sig = inspect.signature(handler) + + # get the return type of the function + # e.g. Iterator[Union[Item, Exception]] + return_type = sig.return_annotation + + # this must have a return type + if return_type == inspect.Signature.empty: + raise TypeError(f"Could not get return type for {handler.__name__}") + + # remove top-level iterator if it has it + if return_type._name == "Iterator": + return_type = return_type.__args__[0] + + args: Optional[Tuple[Type]] = get_union_args(return_type) # type: ignore[type-arg] + if args is None: + raise TypeError( + f"Could not get union args for {return_type} in {handler.__name__}" + ) + + # remove exceptions + t_args = tuple(t for t in args if t != Exception) + + for t in t_args: + if BaseEvent not in t.__mro__: + raise TypeError( + f"Return type {t} from {return_type} of {handler.__name__} does not contain BaseEvent" + ) + if t == BaseEvent: + raise TypeError( + f"Return type {t} from {return_type} of {handler.__name__} is BaseEvent, which is not allowed" + ) + + return tuple(t_args) + + +def _cache_key_to_type(c: CacheKey) -> Any: + """ + If there's one item in the 
cache key, return that + If there are multiple, return a Union of them + """ + assert len(c) > 0 + if len(c) == 1: + return c[0] + else: + assert isinstance(c, tuple) + + return Union[c] # type: ignore[valid-type] # If parsed, should mention: @@ -285,7 +331,7 @@ def _log_handler(self, path: Path, handler: Any) -> None: def _parse_raw(self, filter_type: Optional[Type[BaseEvent]] = None) -> BaseResults: """Parse the takeout with no cache. If a filter is specified, only parses those files""" handlers = self._group_by_return_type(filter_type=filter_type) - for cache_key, result_tuples in handlers.items(): + for _, result_tuples in handlers.items(): for path, itr in result_tuples: self._log_handler(path, itr) yield from itr @@ -339,9 +385,9 @@ def _group_by_return_type( """ handlers: Dict[CacheKey, List[Tuple[Path, BaseResults]]] = defaultdict(list) for path, handler in self.dispatch_map().items(): - ckey: CacheKey = _parse_handler_return_type(handler) + ckey: CacheKey = _handler_type_cache_key(handler) # don't include in the result if we're filtering to a specific type - if filter_type is not None and ckey != filter_type: + if filter_type is not None and filter_type not in ckey: logger.debug( f"Provided '{filter_type}' as filter, '{ckey}' doesn't match, ignoring '{path}'..." 
) @@ -381,14 +427,9 @@ def _cached_parse( ) -> BaseResults: handlers = self._group_by_return_type(filter_type=filter_type) for cache_key, result_tuples in handlers.items(): - # Hmm -- I think this should work with CacheKeys that have multiple - # types but it may fail -- need to check if one is added - # - # create a function which groups the iterators for this return type - # that all gets stored in one database - # - # the return type here is purely for cachew, so it can infer the type - def _func() -> Iterator[Res[cache_key]]: # type: ignore[valid-type] + _ret_type: Any = _cache_key_to_type(cache_key) + + def _func() -> Iterator[Res[_ret_type]]: # type: ignore[valid-type] for path, itr in result_tuples: self._log_handler(path, itr) yield from itr diff --git a/setup.cfg b/setup.cfg index 090b7e6..b2bd51d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,10 +1,66 @@ +[metadata] +name = google_takeout_parser +version = 0.1.4 +description = Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...) 
+long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/seanbreckenridge/google_takeout_parser +author = Sean Breckenridge +author_email = seanbrecke@gmail.com +license = MIT +license_files = LICENSE +classifiers = + License :: OSI Approved :: MIT License + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 +keywords = google data parsing + +[options] +packages = find: +install_requires = + IPython + beautifulsoup4>=4.9.0 + cachew>=0.14.20230922 + click>=8.1 + logzero>=1.7.0 + lxml>=4.6.0 + platformdirs>=2.3.0 + pytz>=2021.3 +python_requires = >=3.8 +include_package_data = True + +[options.packages.find] +exclude = + tests* +include = + google_takeout_parser + google_takeout_parser.parse_html + +[options.entry_points] +console_scripts = + google_takeout_parser = google_takeout_parser.__main__:main + +[options.extras_require] +testing = + flake8 + mypy + pytest + +[options.package_data] +google_takeout_parser = py.typed + [flake8] -ignore=E501,E402,W503,E266,E203 +ignore = E501,E402,W503,E266,E203 [mypy] pretty = True show_error_context = True -show_error_codes = True +show_error_codes = True check_untyped_defs = True namespace_packages = True disallow_any_generics = True @@ -19,7 +75,6 @@ warn_unreachable = True [tool:pytest] addopts = - --doctest-modules google_takeout_parser - -vv - ./tests/ - + --doctest-modules google_takeout_parser + -vv + ./tests/ diff --git a/setup.py b/setup.py index d459510..7f1a176 100644 --- a/setup.py +++ b/setup.py @@ -1,52 +1,4 @@ -from pathlib import Path -from setuptools import setup, find_packages +from setuptools import setup -long_description = Path("README.md").read_text() -reqs = Path("requirements.txt").read_text().strip().splitlines() - -pkg = 
"google_takeout_parser" -setup( - name=pkg, - version="0.1.3", - url="https://github.com/seanbreckenridge/google_takeout_parser", - author="Sean Breckenridge", - author_email="seanbrecke@gmail.com", - description=( - """Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...)""" - ), - long_description=long_description, - long_description_content_type="text/markdown", - license="MIT", - packages=find_packages( - include=["google_takeout_parser", "google_takeout_parser.parse_html"] - ), - install_requires=reqs, - package_data={pkg: ["py.typed"]}, - zip_safe=False, - keywords="google data parsing", - python_requires=">=3.7", - entry_points={ - "console_scripts": [ - "google_takeout_parser = google_takeout_parser.__main__:main" - ] - }, - extras_require={ - "testing": [ - "pytest", - "mypy", - "flake8", - ], - ':python_version<"3.7"': [ - "typing_extensions", - ], - }, - classifiers=[ - "License :: OSI Approved :: MIT License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], -) +if __name__ == "__main__": + setup() diff --git a/tests/test_json.py b/tests/test_json.py index 80bf494..5d1bbd1 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -110,7 +110,7 @@ def test_location_old(tmp_path_f) -> None: dt=datetime.datetime( 2017, 12, 10, 23, 14, 58, tzinfo=datetime.timezone.utc ), - accuracy=10, + accuracy=10.0, ), ] @@ -127,7 +127,7 @@ def test_location_new(tmp_path_f: Path) -> None: dt=datetime.datetime( 2017, 12, 10, 23, 14, 58, 30000, tzinfo=datetime.timezone.utc ), - accuracy=10, + accuracy=10.0, ), ] diff --git a/tests/test_types.py b/tests/test_types.py index 91b9187..696ef73 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -1,14 +1,14 @@ import inspect import google_takeout_parser.models as mod -from cachew 
import get_union_args +from cachew.legacy import get_union_args def test_check_union() -> None: """ Makes sure that any classes defined in models are included in the union type - sanity check test to ensure cachew doesnt fail with difficult to debug union/errors + sanity check test to ensure cachew doesn't fail with difficult to debug union/errors """ classes = { @@ -21,3 +21,34 @@ def test_check_union() -> None: union_args = set(ua) assert union_args == classes + + +def test_parsing_return_type() -> None: + from typing import Iterator, Union + from pathlib import Path + from google_takeout_parser.path_dispatch import ( + _cache_key_to_str, + _cache_key_to_type, + _handler_type_cache_key, + ) + from google_takeout_parser.models import Activity, Res, PlayStoreAppInstall + + def _test_func(path: Path) -> Iterator[Res[Activity]]: + yield Exception("test") + + ret_type = _handler_type_cache_key(_test_func) + assert ret_type is not None + assert ret_type == (Activity,) + assert _cache_key_to_str(ret_type) == "activity" + assert _cache_key_to_type(ret_type) == Activity + + def _test_multiple( + path: Path, + ) -> Iterator[Res[Union[Activity, PlayStoreAppInstall]]]: + yield Exception("test") + + ret_type = _handler_type_cache_key(_test_multiple) + assert ret_type is not None + assert ret_type == (Activity, PlayStoreAppInstall) + assert _cache_key_to_str(ret_type) == "activity_playstoreappinstall" + assert _cache_key_to_type(ret_type) == Union[Activity, PlayStoreAppInstall]