cache.py
# For caching objects in files so that handling requests is much quicker
import hashlib
import os
import logging

logger = logging.getLogger()


def stable_hash_str(key: str) -> str:
    """
    A hash function that is consistent across restarts, which is of course important when
    the hash is used for cache file names.
    :param key: string to be hashed
    :return: a 12-character uppercase hex hash string
    """
    str_bytes = bytes(key, "UTF-8")
    m = hashlib.md5(str_bytes)
    return m.hexdigest()[:12].upper()
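
# Note (added for clarity, not in the original source): unlike Python's built-in hash(),
# which is salted per process, stable_hash_str() returns the same 12 uppercase hex
# characters for a given key on every run, so cache file names remain valid across restarts.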


def file_identifier(url: str) -> str:
    """
    Returns the identifying part of the file name used to cache data associated with a URL.
    The key point is that if the URL is for a special cornell.edu asset/catalog item then the
    catalog identifier, something like ML928372, is used. That way the original data can be
    looked up much more easily. If the URL is not a cornell catalog item, a hash is used instead.
    :param url: the URL the cached object was read from
    :return: the identifying part of the file name used to cache data for the URL
    """
    if url.find('cornell.edu') != -1 and url.find('/asset/') != -1:
        # Special cornell URL, so use the catalog number
        after_asset = url[url.find('/asset/') + 7:]
        return 'ML' + after_asset[0:after_asset.find('/')]
    else:
        return stable_hash_str(url)
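
# Illustrative example (the URL below is hypothetical): for a catalog URL such as
# 'https://macaulaylibrary.cornell.edu/asset/928372/embed', file_identifier() returns
# 'ML928372'; any other URL falls back to the 12-character stable hash of the URL.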


def proper_filename(filename):
    """
    Converts the filename parameter to a proper file name by replacing spaces with '_'
    and removing apostrophes.
    :param filename: the name to be cleaned up
    :return: the modified filename
    """
    return filename.replace(" ", "_").replace("'", "")
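
# Illustrative example (hypothetical input): proper_filename("Bald Eagle's Nest") returns
# "Bald_Eagles_Nest", which is safe to use as a cache subdirectory name.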


def get_full_filename(name, suffix='', subdir=''):
    """
    Returns the full filename of the cached file, with the suffix appended.
    :param name: name of the object being stored. Can be a descriptive string or a hash of the URL
    :param suffix: blank if already included in name. Otherwise .wav, .png, .json, etc.
    :param subdir: subdirectory. Useful for grouping files, e.g. by species. The subdirectory
    name is processed into a proper file name (no blanks nor single quotes)
    :return: the full filename of the file in the cache
    """
    # Determine the directory where the cache is stored. tempfile.gettempdir() was considered,
    # but that directory would change each time the app is run and therefore would not preserve
    # cached info across restarts. So "/usr/local/imagerCache" is used, even though it is not
    # necessarily portable.
    directory = '/usr/local/imagerCache/'
    full_directory = directory + proper_filename(subdir) + "/"
    # Make sure the directory exists
    os.makedirs(full_directory, exist_ok=True)
    filename = full_directory + name + suffix
    return filename
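
# Illustrative example (hypothetical arguments): get_full_filename('ML928372', '.json', 'Northern Cardinal')
# returns '/usr/local/imagerCache/Northern_Cardinal/ML928372.json', creating the
# subdirectory first if it does not already exist.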


def write_to_cache(data, filename, suffix='', subdir=''):
    """
    Writes data to a file so that it is cached.
    :param data: data to be cached. Can be string or bytes. If a string, it is converted to bytes.
    :param filename: if working with a URL, use stable_hash_str(url) (or file_identifier(url)) as the filename
    :param suffix: blank if already included in filename. Otherwise .wav, .png, .json, etc.
    :param subdir: subdirectory. Useful for grouping files, e.g. by species
    """
    # If data is a string then convert it to bytes
    if isinstance(data, str):
        data = bytes(data, 'utf-8')
    full_filename = get_full_filename(filename, suffix, subdir)
    # Use a context manager so the file is closed even if the write fails
    with open(full_filename, 'wb') as file:
        file.write(data)
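
# Illustrative usage (hypothetical URL and subdirectory), a minimal sketch of caching a
# JSON response and later checking for it:
#   key = file_identifier('https://example.com/api/species.json')
#   write_to_cache('{"name": "example"}', key, '.json', 'species')
#   if file_exists(key, '.json', 'species'):
#       cached = read_from_cache(key, '.json', 'species')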


def file_exists(filename, suffix='', subdir=''):
    """
    Returns True if the cached file exists.
    :param filename: if working with a URL, use stable_hash_str(url) as the filename
    :param suffix: blank if already included in filename. Otherwise .wav, .png, .json, etc.
    :param subdir: subdirectory. Useful for grouping files, e.g. by species
    :return: True if the file exists
    """
    exists = os.path.isfile(get_full_filename(filename, suffix, subdir))
    return exists


def read_from_cache(filename, suffix='', subdir=''):
    """
    Reads and returns data from the cached file.
    Note: if reading JSON from the cache and it needs to be converted to a Python object, use:
    return json.loads(json_data, object_hook=lambda d: SimpleNamespace(**d))
    :param filename: if working with a URL, use stable_hash_str(url) as the filename
    :param suffix: blank if already included in filename. Otherwise .wav, .png, .json, etc.
    :param subdir: subdirectory. Useful for grouping files, e.g. by species
    :return: the bytes stored in the file (the file is opened in binary mode)
    """
    full_filename = get_full_filename(filename, suffix, subdir)
    # Use a context manager so the file is closed even if the read fails
    with open(full_filename, 'rb') as file:
        data = file.read()
    return data
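
# Illustrative usage (hypothetical identifiers), following the note in the docstring above
# for turning cached JSON back into a Python object:
#   import json
#   from types import SimpleNamespace
#   json_data = read_from_cache(file_identifier(url), '.json', 'species')
#   info = json.loads(json_data, object_hook=lambda d: SimpleNamespace(**d))
# json.loads() accepts bytes as well as str, so the raw cached data can be passed directly.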


def fill_species_cache():
    """
    Gets the list of species and, for each one, determines the data for the species and
    caches it into a JSON file for that species. Pauses 2 seconds between species so that
    the server is not bogged down.
    """
    from ebird import ebird
    import time
    logger.info("Caching all species...")
    species_list = ebird.get_species_name_list()
    for species_name in species_list:
        logger.debug(f"Caching species: {species_name}")
        ebird.get_species_info(species_name)
        # Don't bog down server
        time.sleep(2.0)
    logger.info("Done caching all species")


def erase_cache():
    """
    Does a system call to remove all of the server-side cached JSON files so that fresh
    data can be generated and used. This is important when the
    supplementalSpeciesConfig.json file is updated. Does not erase any of the wav or
    image files, since those do not change and they are much more costly to generate.
    """
    logger.info("Erasing cached JSON files from the imager cache")
    dir_name = get_full_filename('')
    # Remove the *Cache.json files in the cache directory and in each subdirectory
    os.system(f'rm {dir_name}*Cache.json')
    os.system(f'rm {dir_name}*/*Cache.json')