-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmissing_translation_manager.py
112 lines (95 loc) · 3.79 KB
/
missing_translation_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import sys
from csv import DictReader
from api.dictionary.exceptions.http import WordAlreadyExists
from api.servicemanager import DictionaryServiceManager
from api.storage import MissingTranslationCsvWriter
class Extractor:
    """
    Command-line action that dumps missing translations for one language.

    The work is delegated entirely to ``MissingTranslationCsvWriter``: this
    class only builds the writer for the requested language and triggers it.
    What exactly gets written (and where) is defined by that writer.
    """

    def __init__(self, language):
        """Remember *language* and build the CSV writer bound to it."""
        self.language = language
        self.writer = MissingTranslationCsvWriter(self.language)

    def start(self):
        """Run the export by calling the writer's ``to_csv()``."""
        self.writer.to_csv()
class Loader:
    """
    Load a CSV file of referenced English words into the database through
    the dictionary REST service.

    The bilingual CSV (``user_data/<language>-mg.csv``) is expected to have a
    header. Expected column names are:
    - english (column #1): the English word
    - hits (column #2): number of times a translation of it has been asked for
    - malagasy (column #3): its Malagasy translation

    Part of speech is determined automatically by looking the word up in a
    monolingual dictionary CSV file (``user_data/<language>.csv``), which is
    read through its ``anglisy`` and ``pos_id`` columns.

    Both files are read as UTF-8.
    NOTE(review): UTF-8 is assumed for the user_data files -- confirm.
    """

    # Maps the ``pos_id`` column of the monolingual CSV to the part-of-speech
    # code sent to the dictionary service.
    POS_DICT = {
        "1": "ana",
        "2": "mat",
        "3": "mpam-ana",
        "4": "mpampiankina",
        "5": "tamb",
    }

    # Affixes of the Malagasy translation that make
    # determine_part_of_speech() choose "mat" over "ana".
    _MAT_PREFIXES = ("man", "mi")
    _MAT_SUFFIXES = ("aina",)

    def __init__(self, language):
        """Set up file paths, the REST client and the empty lookup tables."""
        self.language = language
        self.monolingual_dictionary_filepath = f"user_data/{language}.csv"
        self.bilingual_dictionary_filepath = f"user_data/{language}-mg.csv"
        self.dictionary_service = DictionaryServiceManager()
        # word (lower-cased) -> list of part-of-speech codes
        self.monolingual = {}
        # word (lower-cased) -> Malagasy translation
        self.bilingual = {}

    def determine_part_of_speech(self, word, malagasy_translation, pos_list):
        """
        Pick a single part of speech for *word* among the candidates.

        Unless both "mat" and "ana" are candidates, the first entry of
        *pos_list* is returned as is. When both are present, the translation
        decides: "mat" if it starts with "man"/"mi" or ends with "aina",
        "ana" otherwise.

        *word* is not used by the heuristic; it is kept so existing callers
        keep working. Raises ``IndexError`` if *pos_list* is empty.
        """
        if "mat" not in pos_list or "ana" not in pos_list:
            return pos_list[0]

        looks_like_mat = malagasy_translation.startswith(
            self._MAT_PREFIXES
        ) or malagasy_translation.endswith(self._MAT_SUFFIXES)
        return "mat" if looks_like_mat else "ana"

    def load_monolingual(self):
        """
        Fill ``self.monolingual`` from the monolingual dictionary CSV.

        Rows with an empty ``pos_id`` are ignored. Rows whose ``pos_id`` is
        not a key of ``POS_DICT`` are reported on stdout and skipped instead
        of aborting the whole load with a ``KeyError``.
        """
        print("reading monolingual dictionary")
        # newline="" is what the csv module requires to handle embedded
        # newlines correctly; the explicit encoding avoids depending on the
        # platform default.
        with open(
            self.monolingual_dictionary_filepath,
            "r",
            encoding="utf-8",
            newline="",
        ) as fd:
            for row in DictReader(fd):
                pos_id = row["pos_id"]
                if not pos_id:
                    continue

                pos = self.POS_DICT.get(pos_id)
                if pos is None:
                    print(
                        f"skipping unknown pos_id {pos_id!r} "
                        f"for {row['anglisy']!r}"
                    )
                    continue

                english = row["anglisy"].lower()
                self.monolingual.setdefault(english, []).append(pos)

    def load_bilingual(self):
        """
        Fill ``self.bilingual`` from the bilingual CSV.

        Only rows where ``english``, ``malagasy`` and ``hits`` are all
        non-empty are kept. When a word appears more than once, the last
        row wins.
        """
        with open(
            self.bilingual_dictionary_filepath,
            "r",
            encoding="utf-8",
            newline="",
        ) as fd:
            for row in DictReader(fd):
                english = row["english"].lower()
                if english and row["malagasy"] and row["hits"]:
                    self.bilingual[english] = row["malagasy"]

    def start(self):
        """
        Load both CSV files, then create one dictionary entry per word.

        Words of the bilingual file that are absent from the monolingual
        dictionary are skipped, since no part of speech can be determined
        for them. Each remaining word is POSTed to
        ``entry/<language>/create`` on the dictionary service.
        """
        self.load_monolingual()
        self.load_bilingual()

        for word, malagasy_translation in self.bilingual.items():
            print(">>>>", word, "<<<<")

            word = word.strip()
            pos_list = self.monolingual.get(word)
            if not pos_list:
                continue

            entry = {
                "language": self.language,
                "definitions": [
                    {
                        "definition": malagasy_translation,
                        "definition_language": "mg",
                    }
                ],
                "word": word,
                "part_of_speech": self.determine_part_of_speech(
                    word, malagasy_translation, pos_list
                ),
            }
            resp = self.dictionary_service.post(
                f"entry/{self.language}/create", json=entry
            )
            if resp.status_code == WordAlreadyExists.status_code:
                # The word is already known to the service: nothing else is
                # done for it, and the loop moves on to the next word.
                continue
if __name__ == "__main__":
    # Command line: <action> <language>
    #   x -> Extractor (dump missing translations to CSV)
    #   l -> Loader    (load translated words through the dictionary service)
    actions = {"x": Extractor, "l": Loader}

    # Fail with a usage message rather than a bare IndexError/KeyError
    # traceback when the action or the language is missing or unknown.
    if len(sys.argv) < 3 or sys.argv[1] not in actions:
        sys.exit(f"usage: {sys.argv[0]} {{x|l}} <language>")

    bot = actions[sys.argv[1]](sys.argv[2])
    bot.start()