diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 63cc35e..0a395b6 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -2,19 +2,19 @@ name: CI
on:
push:
- branches: [master]
+ branches: ['*']
pull_request:
- branches: [master]
+ branches: ['*']
jobs:
build:
strategy:
matrix:
platform: [ubuntu-latest, windows-latest]
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.8, 3.9, "3.10", "3.11"]
exclude: [
- {platform: windows-latest, python-version: "3.8"},
- {platform: windows-latest, python-version: "3.9"}
+ {platform: windows-latest, python-version: "3.9"},
+ {platform: windows-latest, python-version: "3.10"}
]
runs-on: ${{ matrix.platform }}
diff --git a/README.md b/README.md
index 5b15508..b7fb21a 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ This was extracted out of [my HPI](https://github.com/seanbreckenridge/HPI/tree/
## Installation
-Requires `python3.7+`
+Requires `python3.8+`
To install with pip, run:
@@ -174,7 +174,6 @@ Just to give a brief overview, to add new functionality (parsing some new folder
- Add a `model` for it in [`models.py`](google_takeout_parser/models.py) subclassing `BaseEvent` and adding it to the Union at the bottom of the file. That should have a `key` property function which describes each event uniquely (used to merge takeout events)
- Write a function which takes the `Path` to the file you're trying to parse and converts it to the model you created (See examples in [`parse_json.py`](google_takeout_parser/parse_json.py)). Ideally extract a single raw item from the takeout file add a test for it so its obvious when/if the format changes.
-- Set [the `return_type`](https://github.com/seanbreckenridge/google_takeout_parser/blob/7b1ee8ec3c3f36e6f279f20a9a214b6a3e8775f5/google_takeout_parser/parse_json.py#L71) property on the function, to use for caching/filtering
- Add a regex match for the file path to the [`DEFAULT_HANDLER_MAP`](https://github.com/seanbreckenridge/google_takeout_parser/blob/2bd64b7373e4a2ac2ace32e03b25ca3b7e901034/google_takeout_parser/path_dispatch.py#L48)
### Testing
diff --git a/google_takeout_parser/__init__.py b/google_takeout_parser/__init__.py
index 5c05cf6..86adc69 100644
--- a/google_takeout_parser/__init__.py
+++ b/google_takeout_parser/__init__.py
@@ -1,10 +1,6 @@
-from pkg_resources import get_distribution, DistributionNotFound
+import importlib.metadata
-try:
- # Change here if project is renamed and does not equal the package name
- dist_name = __name__
- __version__ = get_distribution(dist_name).version
-except DistributionNotFound:
- __version__ = "unknown"
-finally:
- del get_distribution, DistributionNotFound
+# Change here if project is renamed and does not equal the package name
+__version__ = importlib.metadata.version(__name__)
+
+del importlib
diff --git a/google_takeout_parser/__main__.py b/google_takeout_parser/__main__.py
index 1092740..395d545 100644
--- a/google_takeout_parser/__main__.py
+++ b/google_takeout_parser/__main__.py
@@ -106,9 +106,9 @@ def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None:
"""
from .path_dispatch import TakeoutParser
from .merge import cached_merge_takeouts, merge_events
- from .models import DEFAULT_MODEL_TYPE
+ from .models import DEFAULT_MODEL_TYPE, Res
- res: List[DEFAULT_MODEL_TYPE] = []
+ res: List[Res[DEFAULT_MODEL_TYPE]] = []
if cache:
res = list(cached_merge_takeouts(list(takeout_dir)))
else:
diff --git a/google_takeout_parser/compat.py b/google_takeout_parser/compat.py
deleted file mode 100644
index 2c7eb03..0000000
--- a/google_takeout_parser/compat.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import sys
-
-# from https://github.com/karlicoss/HPI/blob/master/my/core/compat.py
-
-if sys.version_info[:2] >= (3, 8):
- from typing import Literal
-else:
- from typing_extensions import Literal # noqa: F401
diff --git a/google_takeout_parser/merge.py b/google_takeout_parser/merge.py
index 69c0b04..d2f05f6 100644
--- a/google_takeout_parser/merge.py
+++ b/google_takeout_parser/merge.py
@@ -23,8 +23,8 @@
# Note: only used for this module, HPI caches elsewhere
@cachew(
- cache_path=lambda _: str(takeout_cache_path / "_merged_takeouts"),
- depends_on=lambda pths: list(sorted([str(p) for p in pths])),
+ cache_path=str(takeout_cache_path / "_merged_takeouts"),
+ depends_on=lambda tp: str(list(sorted(str(p) for p in tp))),
force_file=True,
logger=logger,
)
diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py
index e332274..58ddd6d 100644
--- a/google_takeout_parser/models.py
+++ b/google_takeout_parser/models.py
@@ -7,7 +7,7 @@
from __future__ import annotations
from datetime import datetime
-from typing import Optional, List, Tuple, Any, Union, Iterator, TYPE_CHECKING, Dict
+from typing import Optional, List, Tuple, Any, Union, Iterator, Dict, Protocol
from dataclasses import dataclass
from .common import Res
@@ -26,14 +26,6 @@
# name, url
Subtitles = Tuple[str, MetaData]
-if TYPE_CHECKING:
- try:
- from typing import Protocol
- except ImportError:
- from typing_extensions import Protocol # type: ignore
-else:
- Protocol = object
-
class BaseEvent(Protocol):
@property
@@ -107,11 +99,11 @@ def key(self) -> int:
class Location(BaseEvent):
lat: float
lng: float
- accuracy: Optional[int]
+ accuracy: Optional[float]
dt: datetime
@property
- def key(self) -> Tuple[float, float, Optional[int], int]:
+ def key(self) -> Tuple[float, float, Optional[float], int]:
return self.lat, self.lng, self.accuracy, int(self.dt.timestamp())
diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py
index b2e2116..274cbca 100644
--- a/google_takeout_parser/parse_html/activity.py
+++ b/google_takeout_parser/parse_html/activity.py
@@ -337,6 +337,3 @@ def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
yield _parse_activity_div(outer_div, file_dt=file_dt)
except Exception as ae:
yield ae
-
-
-_parse_html_activity.return_type = Activity # type: ignore[attr-defined]
diff --git a/google_takeout_parser/parse_html/comment.py b/google_takeout_parser/parse_html/comment.py
index 0a591e6..a3e2a28 100644
--- a/google_takeout_parser/parse_html/comment.py
+++ b/google_takeout_parser/parse_html/comment.py
@@ -60,9 +60,6 @@ def _parse_html_comment_file(p: Path) -> Iterator[Res[YoutubeComment]]:
yield e
-_parse_html_comment_file.return_type = YoutubeComment # type: ignore[attr-defined]
-
-
def test_parse_html_comment_file() -> None:
li_obj = bs4.BeautifulSoup(
"""
- Sent at 2020-04-27 23:18:23 UTC while watching a video.
content here
""",
diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py
index 6f30a03..af011a5 100644
--- a/google_takeout_parser/parse_json.py
+++ b/google_takeout_parser/parse_json.py
@@ -70,9 +70,6 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
yield e
-_parse_json_activity.return_type = Activity # type: ignore[attr-defined]
-
-
def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]:
json_data = json.loads(p.read_text())
if not isinstance(json_data, list):
@@ -91,9 +88,6 @@ def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]:
yield e
-_parse_likes.return_type = LikedYoutubeVideo # type: ignore[attr-defined]
-
-
def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]:
json_data = json.loads(p.read_text())
if not isinstance(json_data, list):
@@ -109,9 +103,6 @@ def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]:
yield e
-_parse_app_installs.return_type = PlayStoreAppInstall # type: ignore[attr-defined]
-
-
def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime:
if f"{key}Ms" in d:
return parse_datetime_millis(d[f"{key}Ms"])
@@ -137,14 +128,12 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]:
lng=float(loc["longitudeE7"]) / 1e7,
lat=float(loc["latitudeE7"]) / 1e7,
dt=_parse_location_timestamp(loc),
- accuracy=None if accuracy is None else int(accuracy),
+ accuracy=None if accuracy is None else float(accuracy),
)
except Exception as e:
yield e
-_parse_location_history.return_type = Location # type: ignore[attr-defined]
-
_sem_required_keys = ["location", "duration"]
@@ -209,9 +198,6 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]:
yield e
-_parse_semantic_location_history.return_type = PlaceVisit # type: ignore[attr-defined]
-
-
def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
json_data = json.loads(p.read_text())
if "Browser History" not in json_data:
@@ -226,6 +212,3 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
)
except Exception as e:
yield e
-
-
-_parse_chrome_history.return_type = ChromeHistory # type: ignore[attr-defined]
diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py
index a60526b..4da5de5 100644
--- a/google_takeout_parser/path_dispatch.py
+++ b/google_takeout_parser/path_dispatch.py
@@ -9,13 +9,14 @@
from typing import (
Iterator,
Dict,
+ Union,
Callable,
Any,
Optional,
List,
Type,
Tuple,
- cast,
+ Literal,
)
from collections import defaultdict
@@ -23,7 +24,6 @@
from cachew import cachew
from . import __version__ as _google_takeout_version
-from .compat import Literal
from .common import Res, PathIsh
from .cache import takeout_cache_path
from .log import logger
@@ -47,22 +47,68 @@
HandlerFunction = Callable[[Path], BaseResults]
HandlerMap = Dict[str, Optional[HandlerFunction]]
-_CacheKeySingle = Type[BaseEvent]
-CacheKey = _CacheKeySingle
+CacheKey = Tuple[Type[BaseEvent], ...]
def _cache_key_to_str(c: CacheKey) -> str:
- return str(c.__name__).casefold()
+ """Convert a cache key to a string"""
+ return "_".join(sorted(p.__name__ for p in c)).casefold()
-def _parse_handler_return_type(handler: HandlerFunction) -> CacheKey:
- assert hasattr(
- handler, "return_type"
- ), f"Handler functions should have an 'return_type' property which specifies what types this produces. See parse_json.py for an example. No 'return_type' on {handler}"
- val: Any = getattr(handler, "return_type")
- assert isinstance(val, type), f"{val} is not a type"
- assert BaseEvent in val.__mro__, f"{val} not a subclass of BaseEvent"
- return cast(_CacheKeySingle, val)
+def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey:
+ # Take a function like Iterator[Union[Item, Exception]] and return Item
+
+ import inspect
+ from cachew.legacy import get_union_args
+
+ sig = inspect.signature(handler)
+
+ # get the return type of the function
+ # e.g. Iterator[Union[Item, Exception]]
+ return_type = sig.return_annotation
+
+ # this must have a return type
+ if return_type == inspect.Signature.empty:
+ raise TypeError(f"Could not get return type for {handler.__name__}")
+
+ # remove top-level iterator if it has it
+ if return_type._name == "Iterator":
+ return_type = return_type.__args__[0]
+
+ args: Optional[Tuple[Type]] = get_union_args(return_type) # type: ignore[type-arg]
+ if args is None:
+ raise TypeError(
+ f"Could not get union args for {return_type} in {handler.__name__}"
+ )
+
+ # remove exceptions
+ t_args = tuple(t for t in args if t != Exception)
+
+ for t in t_args:
+ if BaseEvent not in t.__mro__:
+ raise TypeError(
+ f"Return type {t} from {return_type} of {handler.__name__} does not contain BaseEvent"
+ )
+ if t == BaseEvent:
+ raise TypeError(
+ f"Return type {t} from {return_type} of {handler.__name__} is BaseEvent, which is not allowed"
+ )
+
+ return tuple(t_args)
+
+
+def _cache_key_to_type(c: CacheKey) -> Any:
+ """
+    If there's one item in the cache key, return that
+    If there are multiple, return a Union of them
+ """
+ assert len(c) > 0
+ if len(c) == 1:
+ return c[0]
+ else:
+ assert isinstance(c, tuple)
+
+ return Union[c] # type: ignore[valid-type]
# If parsed, should mention:
@@ -285,7 +331,7 @@ def _log_handler(self, path: Path, handler: Any) -> None:
def _parse_raw(self, filter_type: Optional[Type[BaseEvent]] = None) -> BaseResults:
"""Parse the takeout with no cache. If a filter is specified, only parses those files"""
handlers = self._group_by_return_type(filter_type=filter_type)
- for cache_key, result_tuples in handlers.items():
+ for _, result_tuples in handlers.items():
for path, itr in result_tuples:
self._log_handler(path, itr)
yield from itr
@@ -339,9 +385,9 @@ def _group_by_return_type(
"""
handlers: Dict[CacheKey, List[Tuple[Path, BaseResults]]] = defaultdict(list)
for path, handler in self.dispatch_map().items():
- ckey: CacheKey = _parse_handler_return_type(handler)
+ ckey: CacheKey = _handler_type_cache_key(handler)
# don't include in the result if we're filtering to a specific type
- if filter_type is not None and ckey != filter_type:
+ if filter_type is not None and filter_type not in ckey:
logger.debug(
f"Provided '{filter_type}' as filter, '{ckey}' doesn't match, ignoring '{path}'..."
)
@@ -381,14 +427,9 @@ def _cached_parse(
) -> BaseResults:
handlers = self._group_by_return_type(filter_type=filter_type)
for cache_key, result_tuples in handlers.items():
- # Hmm -- I think this should work with CacheKeys that have multiple
- # types but it may fail -- need to check if one is added
- #
- # create a function which groups the iterators for this return type
- # that all gets stored in one database
- #
- # the return type here is purely for cachew, so it can infer the type
- def _func() -> Iterator[Res[cache_key]]: # type: ignore[valid-type]
+ _ret_type: Any = _cache_key_to_type(cache_key)
+
+ def _func() -> Iterator[Res[_ret_type]]: # type: ignore[valid-type]
for path, itr in result_tuples:
self._log_handler(path, itr)
yield from itr
diff --git a/setup.cfg b/setup.cfg
index 090b7e6..b2bd51d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,10 +1,66 @@
+[metadata]
+name = google_takeout_parser
+version = 0.1.4
+description = Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...)
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/seanbreckenridge/google_takeout_parser
+author = Sean Breckenridge
+author_email = seanbrecke@gmail.com
+license = MIT
+license_files = LICENSE
+classifiers =
+ License :: OSI Approved :: MIT License
+ Programming Language :: Python
+ Programming Language :: Python :: 3
+ Programming Language :: Python :: 3 :: Only
+ Programming Language :: Python :: 3.8
+ Programming Language :: Python :: 3.9
+ Programming Language :: Python :: 3.10
+ Programming Language :: Python :: 3.11
+keywords = google data parsing
+
+[options]
+packages = find:
+install_requires =
+ IPython
+ beautifulsoup4>=4.9.0
+ cachew>=0.14.20230922
+ click>=8.1
+ logzero>=1.7.0
+ lxml>=4.6.0
+ platformdirs>=2.3.0
+ pytz>=2021.3
+python_requires = >=3.8
+include_package_data = True
+
+[options.packages.find]
+exclude =
+ tests*
+include =
+ google_takeout_parser
+ google_takeout_parser.parse_html
+
+[options.entry_points]
+console_scripts =
+ google_takeout_parser = google_takeout_parser.__main__:main
+
+[options.extras_require]
+testing =
+ flake8
+ mypy
+ pytest
+
+[options.package_data]
+google_takeout_parser = py.typed
+
[flake8]
-ignore=E501,E402,W503,E266,E203
+ignore = E501,E402,W503,E266,E203
[mypy]
pretty = True
show_error_context = True
-show_error_codes = True
+show_error_codes = True
check_untyped_defs = True
namespace_packages = True
disallow_any_generics = True
@@ -19,7 +75,6 @@ warn_unreachable = True
[tool:pytest]
addopts =
- --doctest-modules google_takeout_parser
- -vv
- ./tests/
-
+ --doctest-modules google_takeout_parser
+ -vv
+ ./tests/
diff --git a/setup.py b/setup.py
index d459510..7f1a176 100644
--- a/setup.py
+++ b/setup.py
@@ -1,52 +1,4 @@
-from pathlib import Path
-from setuptools import setup, find_packages
+from setuptools import setup
-long_description = Path("README.md").read_text()
-reqs = Path("requirements.txt").read_text().strip().splitlines()
-
-pkg = "google_takeout_parser"
-setup(
- name=pkg,
- version="0.1.3",
- url="https://github.com/seanbreckenridge/google_takeout_parser",
- author="Sean Breckenridge",
- author_email="seanbrecke@gmail.com",
- description=(
- """Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...)"""
- ),
- long_description=long_description,
- long_description_content_type="text/markdown",
- license="MIT",
- packages=find_packages(
- include=["google_takeout_parser", "google_takeout_parser.parse_html"]
- ),
- install_requires=reqs,
- package_data={pkg: ["py.typed"]},
- zip_safe=False,
- keywords="google data parsing",
- python_requires=">=3.7",
- entry_points={
- "console_scripts": [
- "google_takeout_parser = google_takeout_parser.__main__:main"
- ]
- },
- extras_require={
- "testing": [
- "pytest",
- "mypy",
- "flake8",
- ],
- ':python_version<"3.7"': [
- "typing_extensions",
- ],
- },
- classifiers=[
- "License :: OSI Approved :: MIT License",
- "Programming Language :: Python",
- "Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.7",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
- "Programming Language :: Python :: 3.10",
- ],
-)
+if __name__ == "__main__":
+ setup()
diff --git a/tests/test_json.py b/tests/test_json.py
index 80bf494..5d1bbd1 100644
--- a/tests/test_json.py
+++ b/tests/test_json.py
@@ -110,7 +110,7 @@ def test_location_old(tmp_path_f) -> None:
dt=datetime.datetime(
2017, 12, 10, 23, 14, 58, tzinfo=datetime.timezone.utc
),
- accuracy=10,
+ accuracy=10.0,
),
]
@@ -127,7 +127,7 @@ def test_location_new(tmp_path_f: Path) -> None:
dt=datetime.datetime(
2017, 12, 10, 23, 14, 58, 30000, tzinfo=datetime.timezone.utc
),
- accuracy=10,
+ accuracy=10.0,
),
]
diff --git a/tests/test_types.py b/tests/test_types.py
index 91b9187..696ef73 100644
--- a/tests/test_types.py
+++ b/tests/test_types.py
@@ -1,14 +1,14 @@
import inspect
import google_takeout_parser.models as mod
-from cachew import get_union_args
+from cachew.legacy import get_union_args
def test_check_union() -> None:
"""
Makes sure that any classes defined in models are included in the union type
- sanity check test to ensure cachew doesnt fail with difficult to debug union/errors
+ sanity check test to ensure cachew doesn't fail with difficult to debug union/errors
"""
classes = {
@@ -21,3 +21,34 @@ def test_check_union() -> None:
union_args = set(ua)
assert union_args == classes
+
+
+def test_parsing_return_type() -> None:
+ from typing import Iterator, Union
+ from pathlib import Path
+ from google_takeout_parser.path_dispatch import (
+ _cache_key_to_str,
+ _cache_key_to_type,
+ _handler_type_cache_key,
+ )
+ from google_takeout_parser.models import Activity, Res, PlayStoreAppInstall
+
+ def _test_func(path: Path) -> Iterator[Res[Activity]]:
+ yield Exception("test")
+
+ ret_type = _handler_type_cache_key(_test_func)
+ assert ret_type is not None
+ assert ret_type == (Activity,)
+ assert _cache_key_to_str(ret_type) == "activity"
+ assert _cache_key_to_type(ret_type) == Activity
+
+ def _test_multiple(
+ path: Path,
+ ) -> Iterator[Res[Union[Activity, PlayStoreAppInstall]]]:
+ yield Exception("test")
+
+ ret_type = _handler_type_cache_key(_test_multiple)
+ assert ret_type is not None
+ assert ret_type == (Activity, PlayStoreAppInstall)
+ assert _cache_key_to_str(ret_type) == "activity_playstoreappinstall"
+ assert _cache_key_to_type(ret_type) == Union[Activity, PlayStoreAppInstall]