# helper.py
import streamlit as st
from transformers import (MarianMTModel, MarianTokenizer)
import pandas as pd
import requests
import json
import string
from camel_tools.utils.charsets import AR_LETTERS_CHARSET
### ------------------------ Helper Functions ----------------------------- ###
@st.cache(allow_output_mutation=True)
def load_translator(model_name):
    """
    Loads the translation model from Hugging Face, downloading and caching
    it for future use by the app.

    Input Parameters
    ================
    model_name => Hugging Face identifier of the model (typically of the
                  form Helsinki-NLP/opus-mt-ar-en).

    Return Parameters
    =================
    model => Loaded model for translation.
    tokenizer => Tokenizer for the translation model.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return model, tokenizer
def translate_user_text_input(model_name: str, user_input: str) -> str:
    """
    Translates the user's text input from Arabic to English using a
    Helsinki-NLP model from Hugging Face, then sends the translation to the
    frontend for display.

    Input Parameters
    ================
    model_name => Hugging Face identifier of the translation model.
    user_input => Arabic sentence for translation (given by the user).

    Return Parameters
    =================
    translated_sentence => The English translation of the Arabic sentence,
                           returned to the frontend.
    """
    model, tokenizer = load_translator(model_name)
    translated = model.generate(
        **tokenizer(user_input, return_tensors="pt", padding=True))
    # Decode every generated sequence and join them, so batched inputs are
    # not silently reduced to the last decoded sentence.
    translated_sentence = " ".join(
        tokenizer.decode(t, skip_special_tokens=True) for t in translated)
    return translated_sentence
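# Example usage (a minimal sketch; "Helsinki-NLP/opus-mt-ar-en" is the kind
# of checkpoint the docstrings here assume, and any MarianMT Arabic-to-English
# model should behave the same way):
#
#     english = translate_user_text_input(
#         "Helsinki-NLP/opus-mt-ar-en", "صباح الخير")
#     st.write(english)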
def is_replacable(token, pos_dict):
    """
    Helper function which tells the augmenting functions whether a token can
    be replaced. Only the NOUNs, VERBs and ADJECTIVEs of a sentence are kept
    for augmentation; all other, unnecessary parts are skipped. The POS tags
    come from the Farasa API POS tagger.

    Input Parameters
    ================
    token => Token from POS tagging.
    pos_dict => Mapping from each token to its POS tag(s) in the sentence.

    Return Parameters
    =================
    True/False (bool value) => whether the token's POS allows replacement.
    """
    if token in pos_dict:
        # Farasa can return compound tags such as "NOUN+ADJ", so split on
        # "+" and check for any overlap with the allowed tags.
        if bool(set(pos_dict[token].split("+")) & set(['NOUN', 'V', 'ADJ'])):
            return True
    return False
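# Example (the pos_dict shape below is a hypothetical sketch inferred from
# the lookup above; the real dict is built from the Farasa POS response):
#
#     pos_dict = {"كتب": "V+PRON", "في": "PREP"}
#     is_replacable("كتب", pos_dict)  # True  (contains "V")
#     is_replacable("في", pos_dict)   # False (no NOUN/V/ADJ tag)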
def spl(text):
    """
    Helper function to split the sentence into three parts: the first half,
    the second half and the augmented word. This makes it possible to colour
    the augmented word in the frontend.

    The augmented word is marked with a star by the augmenting functions;
    the star is removed here and the sentence is split into three parts to
    be processed in the frontend.

    Input Parameters
    ================
    text => Sentence to split.

    Return Parameters
    =================
    rep => Augmented word (will be coloured in the frontend).
    fhalf => First half of the sentence.
    shalf => Second half of the sentence (after the augmented word).
    """
    fhalf = []
    shalf = []
    rep = ""
    first = True
    for w in text.split():
        if "*" in w:
            # Strip the "*" marker; "_" joins multi-word replacements.
            rep = w.replace("*", "").replace("_", " ")
            first = False
        elif first:
            fhalf.append(w)
        else:
            shalf.append(w)
    fhalf = " ".join(fhalf)
    shalf = " ".join(shalf)
    return rep, fhalf, shalf
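# Example (sketch): the augmenting functions mark the replacement with "*"
# and join multi-word replacements with "_", so
#
#     rep, fhalf, shalf = spl("ذهب *الولد_الصغير* إلى المدرسة")
#
# yields rep == "الولد الصغير", fhalf == "ذهب" and shalf == "إلى المدرسة".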
def ner(text):
    """
    Helper function to check whether the sentence contains a named entity
    such as a country, a famous person, a name, a place etc. Named entities
    carry meanings of their own and can't be augmented, so they are removed.

    Input Parameters
    ================
    text => Input text.

    Return Parameters
    =================
    The NER tag of the first token of the sentence, as returned by the
    Farasa NER API.
    """
    url = 'https://farasa.qcri.org/webapi/ner/'
    api_key = "KMxvdPGsKHXQAbRXGL"
    payload = {'text': text, 'api_key': api_key}
    data = requests.post(url, data=payload)
    result = json.loads(data.text)
    # Farasa returns tokens as "word/TAG"; keep only the tag of the first one.
    return result['text'][0].split("/")[1]
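# Example (sketch; the response shape {'text': ['word/TAG', ...]} is
# inferred from the parsing above, not from the Farasa documentation):
#
#     tag = ner("القاهرة عاصمة مصر")
#     # tag would be the NER label of the first token, e.g. "B-LOC".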
def models_data(file_name):
    """
    Helper function to open a JSON file and return the data to the frontend.

    Input Parameter
    ===============
    file_name => File name of the JSON file to be opened.

    Return Parameter
    ================
    data => Data from the file, returned to the frontend.
    """
    # Use a context manager so the file handle is always closed.
    with open(file_name) as f:
        data = json.load(f)
    return data
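# Example (sketch; the file name and JSON layout are assumptions):
#
#     data = models_data("models.json")
#     st.write(data)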
def seperate_punct(text):
    """
    Helper function to separate punctuation marks from the Arabic and Latin
    characters by inserting spaces. The Arabic characters are imported from
    AR_LETTERS_CHARSET.

    Input Parameters
    ================
    text => Sentence in which to separate the punctuation.

    Return Parameters
    =================
    ret => The sentence with punctuation separated from the words by spaces.
    """
    text = text.strip()
    text = " ".join(text.split())
    ret = ""
    letter = str("".join(AR_LETTERS_CHARSET) + string.ascii_letters)
    for i, l in enumerate(text):
        if not i == len(text) - 1:
            # Insert a space between a letter (or the "*" augmentation
            # marker) and a following punctuation mark, and vice versa.
            if (l in letter or l == "*") and text[i+1] != " " and not text[i+1] in letter:
                ret += l + " "
            elif not (l in letter or l == "*") and text[i+1] != " " and text[i+1] in letter:
                ret += l + " "
            else:
                ret += l
        else:
            ret += l
    ret = " ".join(ret.split())
    return ret
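# Example (sketch): punctuation attached to a word gets its own token, so
#
#     seperate_punct("مرحبا، كيف الحال؟")
#
# returns "مرحبا ، كيف الحال ؟".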
def clean(text):
    """
    Helper function which processes the Arabic text, removing any and all
    punctuation and keeping only the Arabic text.

    Input Parameters
    ================
    text => Sentence for processing.

    Return Parameters
    =================
    text => The clean text after processing.
    """
    # Raw string so the backslash is not treated as an escape character.
    punc = r"""،.:!?؟!:.,''!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~"""
    for l in text:
        if l in punc and l != " ":
            text = text.replace(l, "")
    return text
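# Example (sketch):
#
#     clean("مرحبا، كيف الحال؟")
#
# returns "مرحبا كيف الحال" with all punctuation removed.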
def strip_punc(text):
    """
    Helper function to strip trailing punctuation (any non-Arabic
    characters) from the end of a sentence.

    Input Parameters
    ================
    text => A sentence to strip trailing punctuation from.

    Return Parameter
    ================
    text => The sentence without its trailing punctuation.
    """
    remove = ""
    for l in reversed(text):
        if l in AR_LETTERS_CHARSET:
            break
        remove += l
    # Slice instead of str.replace() so that only the trailing characters
    # are removed, not identical sequences elsewhere in the sentence.
    return text[:len(text) - len(remove)]
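# Example (sketch): only trailing punctuation is stripped, so
#
#     strip_punc("ما اسمك؟!")
#
# returns "ما اسمك" while punctuation inside the sentence is untouched.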
@st.cache(allow_output_mutation=True)
def convert_df_to_csv(df):
    """
    Helper function to convert a pandas dataframe to CSV with the
    'utf-8-sig' encoding. This particular encoding allows Arabic characters
    to be displayed properly in the CSV file.

    Input Parameters
    ================
    df => Pandas dataframe.

    Output Parameters
    =================
    The dataframe as encoded CSV bytes, with the index column removed.
    """
    return df.to_csv(index=False).encode('utf-8-sig')
def show_selected_models(data):
    """
    Helper function to return a list with the models chosen by the user.

    Input Parameters
    ================
    data => JSON object mapping each model name to 1 (selected) or
            0 (not selected).

    Output Parameters
    =================
    selected_models => List containing all the selected models.
    """
    available_models = [
        'arabert',
        'qarib-bert',
        'xlm-roberta-bert',
        'arabart',
        'camelbert',
        'bert-large-arabic',
        'ubc-arbert',
        'ubc-marbertv2',
        'araelectra',
        'aragpt2',
        'aravec',
        'back-translation',
        'm2m'
    ]
    selected_models = []
    for model in available_models:
        # data maps each model name to 1 when the user selected it.
        if data.get(model) == 1:
            selected_models.append(model)
    return selected_models
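# Example (sketch; the flag layout mirrors the docstring above):
#
#     flags = {'arabert': 1, 'aragpt2': 0, 'm2m': 1}
#     show_selected_models(flags)  # ['arabert', 'm2m']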
def download_all_outputs(list_of_dataframes):
    """
    Helper function to download all the concatenated dataframes from the
    selected models as a single CSV file. It also displays the download
    button to the user.

    Input Parameters
    ================
    list_of_dataframes => List of dataframes containing the augmented
                          sentences.
    """
    df = pd.concat(list_of_dataframes, axis=0)
    csv_file = convert_df_to_csv(df)
    st.download_button(
        label="Download CSV",
        data=csv_file,
        file_name='all-outputs.csv',
        mime='text/csv',
    )
def get_df_data(sentences_list, similarity_list):
    """
    Helper function to build a new dataframe by joining the sentences list
    and the similarity list. A length check on sentences_list ensures that
    empty sentence lists are not processed (in which case None is returned).

    Input Parameters
    ================
    sentences_list => A list of augmented sentences.
    similarity_list => List of cosine similarities, one per sentence.

    Output Parameters
    =================
    df => Resulting dataframe containing both the sentences and the
          similarity scores.
    """
    if len(sentences_list) > 0:
        data = list(zip(sentences_list, similarity_list))
        df = pd.DataFrame(data, columns=['Sentences', 'Similarity Score'])
        return df
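# Example (sketch):
#
#     df = get_df_data(["جملة أولى", "جملة ثانية"], [0.91, 0.87])
#     # df has the columns 'Sentences' and 'Similarity Score';
#     # an empty sentences list returns None.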
### ----------------- End of Helper Functions ----------------------------- ###